def test_multinomial_nb():
    """Check that TPOT's MultinomialNB operator predicts the same labels as sklearn's MultinomialNB."""
    tpot_obj = TPOT()
    tpot_output = tpot_obj._multinomial_nb(training_testing_data, 1.0)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    # Reference model built directly with sklearn
    reference = MultinomialNB(alpha=1.0, fit_prior=True)
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def test_variance_threshold():
    """Check that TPOT's _variance_threshold keeps the same columns as sklearn's VarianceThreshold."""
    tpot_obj = TPOT()

    meta_columns = ['class', 'group', 'guess']
    is_training = training_testing_data['group'] == 'training'
    train_features = training_testing_data.loc[is_training].drop(meta_columns, axis=1)

    selector = VarianceThreshold(threshold=0)
    selector.fit(train_features)
    kept_indices = selector.get_support(True)
    # Selected feature columns followed by the non-feature columns
    expected_columns = list(train_features.iloc[:, kept_indices].columns) + meta_columns

    assert np.array_equal(
        tpot_obj._variance_threshold(training_testing_data, 0),
        training_testing_data[expected_columns])
def test_logistic_regression():
    """Check that TPOT's logistic regression operator predicts the same labels as sklearn's LogisticRegression."""
    tpot_obj = TPOT()
    tpot_output = tpot_obj._logistic_regression(training_testing_data, 5., 0, True)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    # Reference model built directly with sklearn
    reference = LogisticRegression(C=5., penalty='l1', dual=False, random_state=42)
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def test_knnc_2():
    """Check that TPOT's k-nearest neighbor operator matches sklearn's classifier when n_neighbor=0."""
    tpot_obj = TPOT()
    tpot_output = tpot_obj._knnc(training_testing_data, 0, 0)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    # The sklearn reference uses n_neighbors=2 for this case
    reference = KNeighborsClassifier(n_neighbors=2, weights='uniform')
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def test_extra_trees_3():
    """Check that TPOT's ExtraTreesClassifier operator matches sklearn's version when min_weight > 0.5."""
    tpot_obj = TPOT()
    tpot_output = tpot_obj._extra_trees(training_testing_data, 0, 1., 0.6)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    # The sklearn reference uses min_weight_fraction_leaf=0.5 for this case
    reference = ExtraTreesClassifier(
        n_estimators=500,
        random_state=42,
        max_features=1.,
        min_weight_fraction_leaf=0.5,
        criterion='gini')
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def test_random_forest_2():
    """Check that TPOT's random forest operator matches sklearn's random forest when min_weight>0.5."""
    tpot_obj = TPOT()
    tpot_output = tpot_obj._random_forest(training_testing_data, 0.6)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    # The sklearn reference uses min_weight_fraction_leaf=0.5 for this case
    reference = RandomForestClassifier(
        n_estimators=500,
        min_weight_fraction_leaf=0.5,
        random_state=42,
        n_jobs=-1)
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def test_combine_dfs_2():
    """Check that _combine_dfs returns an equal frame when both input dataframes are equal."""
    tpot_obj = TPOT()

    def build_frame():
        # Every frame in this test has the same two columns
        return pd.DataFrame({'a': range(10), 'b': range(10, 20)})

    df1 = build_frame()
    df2 = build_frame()
    expected = build_frame()

    assert tpot_obj._combine_dfs(df1, df2).equals(expected)
def test_bernoulli_nb():
    """Check that TPOT's BernoulliNB operator predicts the same labels as sklearn's BernoulliNB."""
    tpot_obj = TPOT()
    tpot_output = tpot_obj._bernoulli_nb(training_testing_data, 1.0, 0.0)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    # Reference model built directly with sklearn
    reference = BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True)
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def test_gradient_boosting_2():
    """Check that TPOT's GradientBoostingClassifier operator matches sklearn's classifier when max_depth < 1."""
    tpot_obj = TPOT()
    tpot_output = tpot_obj._gradient_boosting(training_testing_data, 1.0, 0)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    # The sklearn reference uses max_depth=1 for this case
    reference = GradientBoostingClassifier(
        learning_rate=1.0,
        max_depth=1,
        n_estimators=500,
        random_state=42)
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def test_passive_aggressive_2():
    """Check that TPOT's PassiveAggressiveClassifier operator matches sklearn's classifier when C == 0.0."""
    tpot_obj = TPOT()
    tpot_output = tpot_obj._passive_aggressive(training_testing_data, 0.0, 0)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    # The sklearn reference uses C=0.0001 for this case
    reference = PassiveAggressiveClassifier(
        C=0.0001,
        loss='hinge',
        fit_intercept=True,
        random_state=42)
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def test_linear_svc():
    """Check that TPOT's LinearSVC operator predicts the same labels as sklearn's LinearSVC."""
    tpot_obj = TPOT()
    tpot_output = tpot_obj._linear_svc(training_testing_data, 1.0, 0, True)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    # Reference model built directly with sklearn
    reference = LinearSVC(C=1.0, loss='hinge', fit_intercept=True, random_state=42)
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def test_svc_2():
    """Check that TPOT's SVC operator matches sklearn's SVC when C<0.0001."""
    tpot_obj = TPOT()
    tpot_output = tpot_obj._svc(training_testing_data, 0.00001)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    # The sklearn reference uses C=0.0001 for this case
    reference = SVC(C=0.0001, random_state=42)
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def test_random_forest_3():
    """Check that TPOT's random forest operator matches sklearn's random forest when max_features>no. of features."""
    tpot_obj = TPOT()
    tpot_output = tpot_obj._random_forest(training_testing_data, 100)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    # The sklearn reference uses max_features=64 for this case
    reference = RandomForestClassifier(
        n_estimators=500,
        max_features=64,
        random_state=42,
        n_jobs=-1)
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def test_decision_tree_3():
    """Check that TPOT's decision tree operator matches sklearn's decision tree when max_features>no. of features.

    NOTE(review): another test in this file is also named test_decision_tree_3.
    The later definition replaces the earlier one at import time, so only one
    of them is collected -- consider renaming one.
    """
    tpot_obj = TPOT()
    tpot_output = tpot_obj._decision_tree(training_testing_data, 100, 0)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    # The sklearn reference uses max_features=64 and no depth limit for this case
    reference = DecisionTreeClassifier(max_features=64, max_depth=None, random_state=42)
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def test_linear_svc_2():
    """Check that TPOT's LinearSVC operator matches sklearn's LinearSVC when C == 0.0."""
    tpot_obj = TPOT()
    tpot_output = tpot_obj._linear_svc(training_testing_data, 0.0, 0, True)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    # The sklearn reference uses C=0.0001 for this case
    reference = LinearSVC(C=0.0001, penalty='l1', dual=False, random_state=42)
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def test_ada_boost_2():
    """Check that TPOT's AdaBoostClassifier operator matches sklearn's classifier when learning_rate == 0.0."""
    tpot_obj = TPOT()
    tpot_output = tpot_obj._ada_boost(training_testing_data, 0.0)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    # The sklearn reference uses learning_rate=0.0001 for this case
    reference = AdaBoostClassifier(n_estimators=500, random_state=42, learning_rate=0.0001)
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def test_decision_tree_3():
    """Check that TPOT's decision tree operator matches sklearn's decision tree when min_weight>0.5.

    NOTE(review): another test in this file is also named test_decision_tree_3.
    The later definition replaces the earlier one at import time, so only one
    of them is collected -- consider renaming one. That other test passes two
    extra arguments to _decision_tree while this one passes a single value;
    confirm which call matches the current _decision_tree signature.
    """
    tpot_obj = TPOT()
    tpot_output = tpot_obj._decision_tree(training_testing_data, 0.6)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    # The sklearn reference uses min_weight_fraction_leaf=0.5 for this case
    reference = DecisionTreeClassifier(min_weight_fraction_leaf=0.5, random_state=42)
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def test_gaussian_nb():
    """Check that TPOT's GaussianNB operator predicts the same labels as sklearn's GaussianNB."""
    tpot_obj = TPOT()
    tpot_output = tpot_obj._gaussian_nb(training_testing_data)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    # Reference model built directly with sklearn
    reference = GaussianNB()
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def test_get_params():
    """Assert that get_params returns the exact dictionary of parameters used by TPOT.

    The expected dictionary is built from the default values declared on
    TPOT.__init__, overridden by the keyword arguments passed to the
    constructor in this test.
    """
    kwargs = {'population_size': 500, 'generations': 1000, 'verbosity': 1}
    tpot_obj = TPOT(**kwargs)

    # inspect.getargspec was removed in Python 3.11; prefer getfullargspec and
    # fall back to getargspec only on interpreters that lack it.
    try:
        get_argspec = inspect.getfullargspec
    except AttributeError:
        get_argspec = inspect.getargspec

    initializer = get_argspec(TPOT.__init__)
    # `defaults` is None when no parameter has a default value
    defaults = initializer.defaults or ()
    # Defaults always belong to the trailing parameters, so align from the end
    # rather than assuming every parameter after `self` has one.
    first_defaulted = len(initializer.args) - len(defaults)
    default_kwargs = dict(zip(initializer.args[first_defaulted:], defaults))
    default_kwargs.update(kwargs)

    assert tpot_obj.get_params() == default_kwargs
def test_df_feature_selection():
    """Check the column labels of the frame returned by _dt_feature_selection when called with 10."""
    tpot_obj = TPOT()

    expected_columns = [
        '00002', '00013', '00020', '00021', '00026',
        '00042', '00043', '00058', '00061',
        'class', 'group', 'guess',
    ]

    selected = tpot_obj._dt_feature_selection(training_testing_data, 10)
    assert np.array_equal(selected.columns.values, expected_columns)
def test_predict_2():
    """Check that TPOT's predict returns a result of shape (num_testing_rows,).

    NOTE(review): another test in this file is also named test_predict_2.
    The later definition replaces the earlier one at import time, so only one
    of them is collected -- consider renaming one.
    """
    tpot_obj = TPOT()
    tpot_obj._training_classes = training_classes
    tpot_obj._training_features = training_features

    # Install a hand-written single-operator pipeline as the optimized one
    pipeline_string = '_logistic_regression(input_df, 1.0, 0, True)'
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)

    predictions = tpot_obj.predict(testing_features)
    expected_shape = (testing_features.shape[0], )

    assert predictions.shape == expected_shape
def test_train_model_and_predict():
    """Ensure that _train_model_and_predict returns its input unchanged when there are no feature columns.

    The input is restricted to the last three columns of the dataset, i.e.
    class, group and guess.
    """
    tpot_obj = TPOT()

    # DataFrame.ix was removed in pandas 1.0. The slice here is purely
    # positional (last three columns), so .iloc is the direct replacement.
    expected = training_testing_data.iloc[:, -3:]
    result = tpot_obj._train_model_and_predict(
        training_testing_data.iloc[:, -3:],
        LinearSVC,
        C=5.,
        penalty='l1',
        dual=False)

    assert np.array_equal(expected, result)
def test_static_models():
    """Ensure that the TPOT static classifiers output the same predictions as sklearn.

    Each entry in `models` is a tuple of
    (TPOT operator, sklearn estimator class, TPOT keyword arguments,
    sklearn keyword arguments).
    """
    tpot_obj = TPOT()
    models = [
        (tpot_obj.decision_tree, DecisionTreeClassifier,
         {'max_features': 0, 'max_depth': 0},
         {'max_features': 'auto', 'max_depth': None}),
        (tpot_obj.svc, SVC,
         {'C': 0.0001},
         {'C': 0.0001}),
        (tpot_obj.random_forest, RandomForestClassifier,
         {'n_estimators': 100, 'max_features': 0},
         {'n_estimators': 100, 'max_features': 'auto', 'n_jobs': -1}),
        (tpot_obj.logistic_regression, LogisticRegression,
         {'C': 0.0001},
         {'C': 0.0001}),
        (tpot_obj.knnc, KNeighborsClassifier,
         {'n_neighbors': 100},
         {'n_neighbors': 100}),
    ]

    for model, sklearn_model, model_params, sklearn_params in models:
        result = model(training_testing_data, **model_params)

        # Only the constructor is guarded: an estimator that does not accept
        # random_state raises TypeError there. Keeping fit() outside the try
        # stops a TypeError raised during fitting from being masked by a
        # silent retry without random_state.
        try:
            sklearn_model_obj = sklearn_model(random_state=42, **sklearn_params)
        except TypeError:
            sklearn_model_obj = sklearn_model(**sklearn_params)
        sklearn_model_obj.fit(training_features, training_classes)

        result = result[result['group'] == 'testing']
        expected = sklearn_model_obj.predict(testing_features)

        assert np.array_equal(result['guess'].values, expected), \
            "Model {} failed".format(str(model))
def test_unroll_nested():
    """Check that export utils' unroll_nested_fuction_calls produces the expected pipeline_list."""
    tpot_obj = TPOT()

    pipeline_string = '_logistic_regression(input_df, 1.0, 0, True)'
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)

    # A single operator call unrolls to one step
    expected_steps = [
        ['result1', '_logistic_regression', 'input_df', '1.0', '0', 'True'],
    ]

    actual_steps = unroll_nested_fuction_calls(pipeline)
    assert expected_steps == actual_steps
def test_predict_2():
    """Assert that TPOT's predict returns a result of shape (num_testing_rows,).

    NOTE(review): another test in this file is also named test_predict_2.
    The later definition replaces the earlier one at import time, so only one
    of them is collected -- consider renaming one.
    """
    tpot_obj = TPOT()

    # Install a hand-written single-operator pipeline, compile it and fit it
    pipeline_string = 'DecisionTreeClassifier(input_matrix)'
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    predictions = tpot_obj.predict(testing_features)
    expected_shape = (testing_features.shape[0], )

    assert predictions.shape == expected_shape
def test_replace_function_calls_2():
    """Check that export utils' replace_function_calls raises no exception for any DataFrame primitive."""
    tpot_obj = TPOT()

    for primitive in tpot_obj._pset.primitives[pd.DataFrame]:
        # One-step pipeline: result name, operator name, then the value of the
        # first terminal registered for each argument type
        step = ['result1', primitive.name]
        step.extend(tpot_obj._pset.terminals[arg_type][0].value for arg_type in primitive.args)
        replace_function_calls([step])
def test_rfe_2():
    """Ensure that the TPOT RFE outputs the same result as the sklearn RFE when num_features>no. of features in the dataframe."""
    tpot_obj = TPOT()

    non_feature_columns = ['class', 'group', 'guess']
    training_rows = training_testing_data.loc[training_testing_data['group'] == 'training']
    training_features = training_rows.drop(non_feature_columns, axis=1)

    estimator = LinearSVC()
    # n_features_to_select is passed by keyword: recent scikit-learn releases
    # reject it as a positional argument, while the keyword form is accepted
    # by older releases as well.
    rfe = RFE(estimator, n_features_to_select=100, step=0.1)
    rfe.fit(training_features, training_classes)
    mask = rfe.get_support(True)
    # Selected feature columns followed by the non-feature columns
    mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns

    assert np.array_equal(
        training_testing_data[mask_cols],
        tpot_obj._rfe(training_testing_data, 64, 0.1))
def test_unroll_nested_2():
    """Check that export utils' unroll_nested_fuction_calls unrolls nested function calls as expected."""
    tpot_obj = TPOT()

    pipeline_string = (
        '_extra_trees(_select_percentile(input_df, 40), 32, 0.62, 0.45000000000000001)'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)

    # The inner call comes first and its result name feeds the outer call
    expected_steps = [
        ['result1', '_select_percentile', 'input_df', '40'],
        ['result2', '_extra_trees', 'result1', '32', '0.62', '0.45'],
    ]

    actual_steps = unroll_nested_fuction_calls(pipeline)
    assert expected_steps == actual_steps
def test_init():
    """Check that the TPOT constructor stores its arguments on attributes of the same name."""
    settings = [
        ('population_size', 500),
        ('generations', 1000),
        ('mutation_rate', 0.05),
        ('crossover_rate', 0.9),
        ('verbosity', 1),
    ]
    tpot_obj = TPOT(**dict(settings))

    # Each constructor argument must be readable back unchanged
    for attribute, expected in settings:
        assert getattr(tpot_obj, attribute) == expected
def test_combine_dfs():
    """Check that combine_dfs merges two dataframes that share one column.

    NOTE(review): this test calls tpot_obj.combine_dfs while test_combine_dfs_2
    calls tpot_obj._combine_dfs -- confirm which name the TPOT class exposes.
    """
    tpot_obj = TPOT()

    column_a = range(10)
    column_b = range(10, 20)
    column_c = range(20, 30)

    df1 = pd.DataFrame({'a': column_a, 'b': column_b})
    df2 = pd.DataFrame({'b': column_b, 'c': column_c})
    expected = pd.DataFrame({'a': column_a, 'b': column_b, 'c': column_c})

    assert tpot_obj.combine_dfs(df1, df2).equals(expected)