def test_export_random_ind():
    """Check that a seeded random individual exports to the expected script.

    A TPOTClassifier is built with ``random_state=39`` and the "TPOT light"
    configuration and initialised via ``_fit_init``. One individual is drawn
    from its toolbox, and the text returned by ``export_pipeline`` (called
    with the classifier's ``random_state``) must equal the reference script.
    """
    classifier = TPOTClassifier(random_state=39, config_dict="TPOT light")
    classifier._fit_init()
    # Attach a disabled tqdm bar as the progress bar.
    classifier._pbar = tqdm(total=1, disable=True)
    individual = classifier._toolbox.individual()

    exported_code = export_pipeline(
        individual,
        classifier.operators,
        classifier._pset,
        random_state=classifier.random_state
    )

    # NOTE(review): the line breaks and indentation inside this literal were
    # reconstructed from a whitespace-collapsed copy; confirm them against
    # the real output of export_pipeline.
    expected_code = """import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=39)

exported_pipeline = BernoulliNB(alpha=1.0, fit_prior=False)
# Fix random state in exported estimator
if hasattr(exported_pipeline, 'random_state'):
    setattr(exported_pipeline, 'random_state', 39)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""

    assert expected_code == exported_code
def test_pipeline_score_save():
    """Check that an exported pipeline records the supplied CV score.

    A fixed SelectPercentile -> DecisionTreeClassifier individual is parsed
    from its string form and exported with ``pipeline_score=0.929813743``.
    The exported text must equal the reference script, including the
    "Average CV score" comment line.
    """
    classifier = TPOTClassifier()
    classifier._fit_init()
    # Attach a disabled tqdm bar as the progress bar.
    classifier._pbar = tqdm(total=1, disable=True)

    # Adjacent literals are concatenated into a single pipeline expression.
    individual_repr = (
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    individual = creator.Individual.from_string(individual_repr, classifier._pset)

    # NOTE(review): the line breaks and indentation inside this literal were
    # reconstructed from a whitespace-collapsed copy; confirm them against
    # the real output of export_pipeline.
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:0.929813743
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=20),
    DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""

    exported_code = export_pipeline(
        individual,
        classifier.operators,
        classifier._pset,
        pipeline_score=0.929813743
    )
    assert_equal(expected_code, exported_code)
def test_random_ind_2():
    """Check the export of a random individual drawn with seed 45.

    A TPOTClassifier is built with ``random_state=45``, one individual is
    drawn from its toolbox, and the text returned by ``export_pipeline``
    must equal the reference script below exactly.
    """
    classifier = TPOTClassifier(random_state=45)
    # Attach a disabled tqdm bar as the progress bar.
    classifier._pbar = tqdm(total=1, disable=True)
    individual = classifier._toolbox.individual()

    # NOTE(review): the line breaks and indentation inside this literal were
    # reconstructed from a whitespace-collapsed copy; confirm them against
    # the real output of export_pipeline.
    expected_code = """import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.built_in_operators import ZeroCount

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \\
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    ZeroCount(),
    LogisticRegression(C=0.0001, dual=False, penalty="l2")
)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
"""

    exported_code = export_pipeline(individual, classifier.operators, classifier._pset)
    assert expected_code == exported_code
def test_pipeline_score_save():
    """Check that an exported pipeline records the supplied CV score.

    A fixed SelectPercentile -> DecisionTreeClassifier individual is parsed
    from its string form and exported with ``pipeline_score=0.929813743``.
    The exported text must equal the reference script, including the
    "Average CV score" comment line.
    """
    classifier = TPOTClassifier()
    classifier._fit_init()
    # Attach a disabled tqdm bar as the progress bar.
    classifier._pbar = tqdm(total=1, disable=True)

    # Adjacent literals are concatenated into a single pipeline expression.
    individual_repr = (
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    individual = creator.Individual.from_string(individual_repr, classifier._pset)

    exported_code = export_pipeline(
        individual,
        classifier.operators,
        classifier._pset,
        pipeline_score=0.929813743
    )

    # NOTE(review): the line breaks and indentation inside this literal were
    # reconstructed from a whitespace-collapsed copy; confirm them against
    # the real output of export_pipeline.
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.929813743
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=20),
    DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""

    assert_equal(expected_code, exported_code)
def test_export_random_ind():
    """Assert that a random individual drawn with seed 39 exports as expected.

    A TPOTClassifier is built with ``random_state=39``, one individual is
    drawn from its toolbox, and the text returned by ``export_pipeline``
    must equal the reference script below exactly.
    """
    tpot_obj = TPOTClassifier(random_state=39)
    # Attach a disabled tqdm bar as the progress bar.
    tpot_obj._pbar = tqdm(total=1, disable=True)
    pipeline = tpot_obj._toolbox.individual()

    # NOTE(review): the line breaks and indentation inside this literal were
    # reconstructed from a whitespace-collapsed copy; confirm them against
    # the real output of export_pipeline.
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \\
    train_test_split(features, tpot_data['target'].values, random_state=42)

exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=65),
    DecisionTreeClassifier(criterion="gini", max_depth=7, min_samples_leaf=4, min_samples_split=18)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""

    # Export once and assert on that value. The previous version also
    # printed a second, separately generated export as a debugging aid.
    exported_code = export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)
    assert expected_code == exported_code
def test_export_random_ind():
    """Check the export of a random individual drawn with seed 39.

    A TPOTClassifier is built with ``random_state=39``, one individual is
    drawn from its toolbox, and the text returned by ``export_pipeline``
    must equal the reference script below exactly.
    """
    classifier = TPOTClassifier(random_state=39)
    # Attach a disabled tqdm bar as the progress bar.
    classifier._pbar = tqdm(total=1, disable=True)
    individual = classifier._toolbox.individual()

    # NOTE(review): the line breaks and indentation inside this literal were
    # reconstructed from a whitespace-collapsed copy; confirm them against
    # the real output of export_pipeline.
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \\
    train_test_split(features, tpot_data['target'].values, random_state=42)

exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=65),
    DecisionTreeClassifier(criterion="gini", max_depth=7, min_samples_leaf=4, min_samples_split=18)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""

    exported_code = export_pipeline(individual, classifier.operators, classifier._pset)
    assert expected_code == exported_code
def test_gp_new_generation():
    """Check that calling a ``_gp_new_generation``-wrapped function bumps the counter.

    ``_gp_generation`` must be 0 on a fresh TPOTClassifier and 1 after a
    single call to a function decorated with ``_gp_new_generation``.
    """
    classifier = TPOTClassifier()
    # Attach a disabled tqdm bar as the progress bar.
    classifier._pbar = tqdm(total=1, disable=True)

    assert classifier._gp_generation == 0

    # _gp_new_generation is a decorator. Rather than running a full fit(),
    # wrap a do-nothing function and invoke that instead.
    @_gp_new_generation
    def _noop(self, foo):
        return None

    _noop(classifier, None)

    assert classifier._gp_generation == 1
def test_score_2():
    """Check that ``TPOTClassifier.score`` returns a known value for a fixed pipeline.

    A 'RandomForestClassifier(input_matrix)' individual is compiled, fitted
    on the module-level training data and scored on the testing data. The
    result must match ``known_score`` within a relative tolerance of 1e-09.
    """
    classifier = TPOTClassifier()
    # Attach a disabled tqdm bar as the progress bar.
    classifier._pbar = tqdm(total=1, disable=True)
    known_score = 0.986318199045  # Assumes use of the TPOT balanced_accuracy function

    # Build the fixed pipeline and install it as the optimized/fitted one.
    individual = creator.Individual.from_string(
        'RandomForestClassifier(input_matrix)', classifier._pset)
    classifier._optimized_pipeline = individual
    classifier._fitted_pipeline = classifier._toolbox.compile(
        expr=classifier._optimized_pipeline)
    classifier._fitted_pipeline.fit(training_features, training_classes)

    # Score through the public TPOT API.
    score = classifier.score(testing_features, testing_classes)

    # Relative closeness check with rel_tol=1e-09 and abs_tol=0.0, as in
    # http://stackoverflow.com/questions/5595425/
    tolerance = max(1e-09 * max(abs(known_score), abs(score)), 0.0)
    assert abs(known_score - score) <= tolerance
def test_score_2():
    """Check that ``TPOTClassifier.score`` returns a known value for a fixed pipeline.

    A 'RandomForestClassifier(input_matrix)' individual is compiled, fitted
    on the module-level training data and scored on the testing data. The
    result must be close to ``known_score`` (relative tolerance 1e-09).
    """

    # http://stackoverflow.com/questions/5595425/
    def _nearly_equal(a, b, rel_tol=1e-09, abs_tol=0.0):
        """Return True when a and b differ by no more than the larger tolerance."""
        allowed = max(rel_tol * max(abs(a), abs(b)), abs_tol)
        return abs(a - b) <= allowed

    known_score = 0.986318199045  # Assumes use of the TPOT balanced_accuracy function

    estimator = TPOTClassifier()
    # Attach a disabled tqdm bar as the progress bar.
    estimator._pbar = tqdm(total=1, disable=True)

    # Reify the pipeline whose score is known, then fit it.
    estimator._optimized_pipeline = creator.Individual.from_string(
        'RandomForestClassifier(input_matrix)',
        estimator._pset
    )
    compiled = estimator._toolbox.compile(expr=estimator._optimized_pipeline)
    estimator._fitted_pipeline = compiled
    estimator._fitted_pipeline.fit(training_features, training_classes)

    # Ask TPOT for the score on the held-out data.
    score = estimator.score(testing_features, testing_classes)

    assert _nearly_equal(known_score, score)
def test_export_random_ind():
    """Check that a seeded random individual exports to the expected script.

    A TPOTClassifier is built with ``random_state=39`` and the "TPOT light"
    configuration and initialised via ``_fit_init``. One individual is drawn
    from its toolbox, and the text returned by ``export_pipeline`` (called
    with the classifier's ``random_state``) must equal the reference script.
    """
    classifier = TPOTClassifier(random_state=39, config_dict="TPOT light")
    classifier._fit_init()
    # Attach a disabled tqdm bar as the progress bar.
    classifier._pbar = tqdm(total=1, disable=True)
    individual = classifier._toolbox.individual()

    # NOTE(review): the line breaks and indentation inside this literal were
    # reconstructed from a whitespace-collapsed copy; confirm them against
    # the real output of export_pipeline.
    expected_code = """import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'].values, random_state=39)

exported_pipeline = BernoulliNB(alpha=1.0, fit_prior=False)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""

    exported_code = export_pipeline(
        individual,
        classifier.operators,
        classifier._pset,
        random_state=classifier.random_state
    )
    assert expected_code == exported_code