def test_theanets_partial_fit():
    clf_complete = TheanetsClassifier(layers=[2],
                                      trainers=[{'algo': 'rmsprop', 'learning_rate': 0.1},
                                                {'algo': 'rprop', 'learning_rate': 0.1}])
    clf_partial = TheanetsClassifier(layers=[2],
                                     trainers=[{'algo': 'rmsprop', 'learning_rate': 0.1}])
    X, y, sample_weight = generate_classification_data()
    clf_complete.fit(X, y)
    clf_partial.fit(X, y)
    clf_partial.partial_fit(X, y, algo='rprop', learning_rate=0.1)

    assert clf_complete.trainers == clf_partial.trainers, 'trainers not saved in partial fit'

    auc_complete = roc_auc_score(y, clf_complete.predict_proba(X)[:, 1])
    auc_partial = roc_auc_score(y, clf_partial.predict_proba(X)[:, 1])
    # Known fail of theanets
    assert auc_complete == auc_partial, 'same networks return different results'
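
# All of these tests rely on a shared `generate_classification_data` fixture that is not
# shown in this file. The function below is only a minimal sketch of what such a fixture
# could look like; the sample count, feature count, column naming and the way `distance`
# and `n_classes` are handled are assumptions, not the test suite's actual implementation.
def _generate_classification_data_sketch(n_classes=2, distance=2.0, n_samples=1000, n_features=4):
    import numpy
    import pandas
    # hypothetical stand-in: Gaussian features shifted per class by `distance`
    y = numpy.random.randint(0, n_classes, size=n_samples)
    X = numpy.random.normal(size=(n_samples, n_features)) + distance * y[:, numpy.newaxis]
    X = pandas.DataFrame(X, columns=['column{}'.format(i) for i in range(n_features)])
    sample_weight = numpy.ones(n_samples)
    return X, y, sample_weight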
def test_cache_classifier():
    cache_helper.clear_cache()
    for Wrapper, Model in [(CacheClassifier, LogisticRegression),
                           (CacheRegressor, LinearRegression)]:
        X, y, weights = generate_classification_data(n_classes=2)
        clf = Wrapper('first', Model()).fit(X, y)
        assert clf._used_cache == False
        clf = Wrapper('first', Model()).fit(X + 0, y + 0)
        assert clf._used_cache == True

        # changed name
        clf = Wrapper('second', Model()).fit(X, y)
        assert clf._used_cache == False

        # changed data
        X_new = X.copy()
        X_new.iloc[0, 0] += 1
        clf = Wrapper('first', Model()).fit(X_new, y)
        assert clf._used_cache == False

        # changed labels
        y_new = y.copy()
        y_new[0] += 1
        clf = Wrapper('first', Model()).fit(X, y_new)
        assert clf._used_cache == False

        # added weights
        clf = Wrapper('first', Model()).fit(X, y, sample_weight=None)
        assert clf._used_cache == False

        # changed parameters
        clf = Wrapper('first', Model(n_jobs=2)).fit(X, y)
        assert clf._used_cache == False

        # fitting previous once again. Checking that overwriting is correct.
        clf = Wrapper('first', Model(n_jobs=2)).fit(X, y)
        assert clf._used_cache == True

    cache_helper.clear_cache()
def very_basic_xgboost_test():
    X, y, w = generate_classification_data(n_classes=2)
    clf = XGBoostClassifier(n_estimators=10).fit(X, y)
    clf.predict(X)
    clf.predict_proba(X)
    # testing that returned features in importances are correct and in the same order
    assert numpy.all(clf.features == clf.get_feature_importances().index)
def test_basic_xgboost():
    X, y, w = generate_classification_data(n_classes=2)
    clf = XGBoostClassifier(n_estimators=10).fit(X, y)
    clf.predict(X)
    clf.predict_proba(X)
    # testing that returned features in importances are correct and in the same order
    assert numpy.all(clf.features == clf.get_feature_importances().index)
def test_gridsearch_metrics_threads(n_threads=3):
    X, y, sample_weight = generate_classification_data(n_classes=2, distance=0.7)
    param_grid = OrderedDict({'reg_param': numpy.linspace(0, 1, 20)})

    from itertools import cycle
    optimizers = cycle([
        RegressionParameterOptimizer(param_grid=param_grid, n_evaluations=4, start_evaluations=2),
        SubgridParameterOptimizer(param_grid=param_grid, n_evaluations=4),
        RandomParameterOptimizer(param_grid=param_grid, n_evaluations=4),
    ])

    for metric in [RocAuc(), OptimalAMS(), OptimalSignificance(), log_loss]:
        scorer = FoldingScorer(metric)
        clf = SklearnClassifier(QDA())
        grid = GridOptimalSearchCV(estimator=clf,
                                   params_generator=next(optimizers),
                                   scorer=scorer,
                                   parallel_profile='threads-{}'.format(n_threads))
        grid.fit(X, y)
        print(grid.params_generator.best_score_)
        print(grid.params_generator.best_params_)
        grid.params_generator.print_results()
def check_folding(classifier, check_instance=True, has_staged_pp=True, has_importances=True):
    X, y, sample_weight = generate_classification_data(distance=0.6)

    assert classifier == classifier.fit(X, y, sample_weight=sample_weight)
    assert list(classifier.features) == list(X.columns)

    check_classification_model(classifier, X, y, check_instance=check_instance,
                               has_staged_pp=has_staged_pp, has_importances=has_importances)

    def mean_vote(x):
        return numpy.mean(x, axis=0)

    labels = classifier.predict(X, mean_vote)
    proba = classifier.predict_proba(X, mean_vote)
    assert numpy.all(proba == classifier.predict_proba(X, mean_vote))

    score = accuracy_score(y, labels)
    print(score)
    assert score > 0.7

    assert numpy.allclose(proba.sum(axis=1), 1), 'probabilities do not sum to 1'
    assert numpy.all(proba >= 0.), 'negative probabilities'

    auc_score = roc_auc_score(y, proba[:, 1])
    print(auc_score)
    assert auc_score > 0.8

    if has_staged_pp:
        for p in classifier.staged_predict_proba(X, mean_vote):
            assert p.shape == (len(X), 2)
        # checking that last iteration coincides with previous
        assert numpy.all(p == proba)
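
# `check_classification_model` (called by `check_folding` above and `check_grid` below) is a
# shared helper that is not defined in this file. A rough sketch of the checks it presumably
# performs, based on how it is called, is given here; the exact assertions, thresholds and the
# binary-classification assumption are all assumptions. `check_instance` is not exercised in
# this sketch.
def _check_classification_model_sketch(classifier, X, y, check_instance=True,
                                       has_staged_pp=True, has_importances=True):
    import numpy
    from sklearn.metrics import roc_auc_score
    labels = classifier.predict(X)
    proba = classifier.predict_proba(X)
    assert labels.shape == (len(X),)
    assert numpy.allclose(proba.sum(axis=1), 1), 'probabilities do not sum to 1'
    assert roc_auc_score(y, proba[:, 1]) > 0.8
    if has_staged_pp:
        for p in classifier.staged_predict_proba(X):
            assert p.shape == (len(X), 2)
    if has_importances:
        importances = classifier.get_feature_importances()
        assert len(importances) == len(classifier.features)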
def test_xgboost_random_states():
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    for random_state in [145, None, check_random_state(None), check_random_state(145)]:
        clf1 = XGBoostClassifier(n_estimators=5, max_depth=1, subsample=0.1, random_state=random_state)
        clf1.fit(X, y)
        clf2 = XGBoostClassifier(n_estimators=5, max_depth=1, subsample=0.1, random_state=random_state)
        clf2.fit(X, y)
        if isinstance(random_state, numpy.random.RandomState):
            assert not numpy.allclose(clf1.predict_proba(X), clf2.predict_proba(X)), \
                'seed: {}'.format(random_state)
        else:
            assert numpy.allclose(clf1.predict_proba(X), clf2.predict_proba(X)), \
                'seed: {}'.format(random_state)
def test_xgboost_works_with_different_dtypes():
    dtypes = ['float32', 'float64', 'int32', 'int64', 'uint32']
    for dtype in dtypes:
        X, y, weights = generate_classification_data(n_classes=2, distance=5)
        clf = XGBoostClassifier(n_estimators=10)
        clf.fit(X.astype(dtype=dtype), y.astype(dtype=dtype), sample_weight=weights.astype(dtype))
        probabilities = clf.predict_proba(X.astype(dtype))

    # testing a single pandas.DataFrame with different dtypes
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    import pandas
    X = pandas.DataFrame()
    for dtype in dtypes:
        X[dtype] = numpy.random.normal(0, 10, size=len(y)).astype(dtype)
    clf = XGBoostClassifier(n_estimators=10)
    clf.fit(X, y, sample_weight=weights)
    probabilities = clf.predict_proba(X)
def test_theanets_reproducibility():
    clf = TheanetsClassifier(trainers=[{'algo': 'nag', 'min_improvement': 0.1, 'max_updates': 10}])
    X, y, _ = generate_classification_data()
    check_classification_reproducibility(clf, X, y)
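
# `check_classification_reproducibility` is used by several reproducibility tests in this
# collection but defined elsewhere. A plausible sketch, assuming it simply fits the classifier
# twice (and once more after cloning) and compares the predicted probabilities; the real helper
# may perform additional checks.
def _check_classification_reproducibility_sketch(clf, X, y):
    import numpy
    from sklearn.base import clone
    proba_first = clf.fit(X, y).predict_proba(X)
    proba_second = clf.fit(X, y).predict_proba(X)
    assert numpy.allclose(proba_first, proba_second), 'refitting is not reproducible'
    proba_cloned = clone(clf).fit(X, y).predict_proba(X)
    assert numpy.allclose(proba_first, proba_cloned), 'cloned estimator is not reproducible'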
def test_pybrain_reproducibility():
    # This test fails because PyBrain can't reproduce training.
    X, y, _ = generate_classification_data()
    clf1 = PyBrainClassifier(layers=[4], epochs=2).fit(X, y)
    clf2 = PyBrainClassifier(layers=[4], epochs=2).fit(X, y)
    print(clf1.predict_proba(X) - clf2.predict_proba(X))
    assert numpy.allclose(clf1.predict_proba(X), clf2.predict_proba(X)), 'different predictions'
    check_classification_reproducibility(clf1, X, y)
def test_nolearn_reproducibility():
    X, y, sample_weight = generate_classification_data()
    cl = NolearnClassifier()
    y_predicted_1 = cl.fit(X, y).predict(X)
    y_predicted_2 = cl.fit(X, y).predict(X)
    assert (y_predicted_1 == y_predicted_2).all(), 'fitting the classifier twice gives different predictions'

    y_predicted_3 = clone(cl).fit(X, y).predict(X)
    assert (y_predicted_1 == y_predicted_3).all(), 'cloned classifier gives different prediction'
def test_factory():
    factory = RegressorsFactory()
    try:
        from rep.estimators.tmva import TMVARegressor
        factory.add_regressor('tmva', TMVARegressor())
    except ImportError:
        pass
    factory.add_regressor('rf', RandomForestRegressor(n_estimators=10))
    factory.add_regressor('ada', AdaBoostRegressor(n_estimators=20))

    X, y, sample_weight = generate_classification_data()
    assert factory == factory.fit(X, y, sample_weight=sample_weight, features=list(X.columns))
    values = factory.predict(X)

    for cl in factory.values():
        assert list(cl.features) == list(X.columns)

    for key, val in values.items():
        score = mean_squared_error(y, val)
        print(score)
        assert score < 0.2

    for key, iterator in factory.staged_predict(X).items():
        assert key != 'tmva', 'tmva does not support staged pp'
        for p in iterator:
            assert p.shape == (len(X), )
        # checking that last iteration coincides with previous
        assert numpy.all(p == values[key])

    # testing picklability
    dump_string = cPickle.dumps(factory)
    clf_loaded = cPickle.loads(dump_string)
    assert type(factory) == type(clf_loaded)

    probs1 = factory.predict(X)
    probs2 = clf_loaded.predict(X)
    for key, val in probs1.items():
        assert numpy.all(val == probs2[key]), 'something strange was loaded'

    report = RegressionReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    report.feature_importance_shuffling(mean_squared_mod).plot(new_plot=True, figsize=(18, 3))
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)
    report.feature_importance()
    report.features_correlation_matrix()
    report.predictions_scatter()

    val = numpy.mean(X['column0'])
    report_mask(report, "column0 > %f" % val, X)
    report_mask(report, lambda x: numpy.array(x['column0']) < val, X)
    report_mask(report, None, X)
def test_factory():
    factory = ClassifiersFactory()
    try:
        from rep.estimators.tmva import TMVAClassifier
        factory.add_classifier('tmva', TMVAClassifier())
    except ImportError:
        pass
    factory.add_classifier('rf', RandomForestClassifier(n_estimators=10))
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=20))

    X, y, sample_weight = generate_classification_data()
    assert factory == factory.fit(X, y, sample_weight=sample_weight, features=list(X.columns),
                                  parallel_profile='threads-4')
    for cl in factory.values():
        assert list(cl.features) == list(X.columns)

    proba = factory.predict_proba(X, parallel_profile='threads-4')
    labels = factory.predict(X, parallel_profile='threads-4')
    for key, val in labels.items():
        score = accuracy_score(y, val)
        print(key, score)
        assert score > 0.7, key

    for key, val in proba.items():
        assert numpy.allclose(val.sum(axis=1), 1), 'probabilities do not sum to 1'
        assert numpy.all(val >= 0.), 'negative probabilities'
        auc_score = roc_auc_score(y, val[:, 1])
        print(auc_score)
        assert auc_score > 0.8

    for key, iterator in factory.staged_predict_proba(X).items():
        assert key != 'tmva', 'tmva does not support staged pp'
        for p in iterator:
            assert p.shape == (len(X), 2)
        # checking that last iteration coincides with previous
        assert numpy.all(p == proba[key])

    # testing picklability
    dump_string = cPickle.dumps(factory)
    clf_loaded = cPickle.loads(dump_string)
    assert type(factory) == type(clf_loaded)

    probs1 = factory.predict_proba(X)
    probs2 = clf_loaded.predict_proba(X)
    for key, val in probs1.items():
        assert numpy.all(val == probs2[key]), 'something strange was loaded'

    report = ClassificationReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    report.feature_importance_shuffling(roc_auc_score_mod).plot(new_plot=True, figsize=(18, 3))
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)

    val = numpy.mean(X['column0'])
    yield check_report_with_mask, report, "column0 > %f" % (val / 2.), X
    yield check_report_with_mask, report, lambda x: numpy.array(x['column0']) < val * 2., X
    yield check_report_with_mask, report, None, X
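
# `check_report_with_mask` (yielded above) and `report_mask` (used in the regressor factory
# test) are helpers defined elsewhere in the test suite. Below is only a hedged sketch of what
# such a helper might do, assuming the report's plotting methods accept a `mask` argument; the
# particular methods exercised here are an assumption, not the suite's actual implementation.
def _check_report_with_mask_sketch(report, mask, X):
    # assumed report methods; the real helper may call a different set of them
    report.roc(mask=mask)
    report.prediction_pdf(mask=mask)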
def test_theanets_reproducibility():
    clf = TheanetsClassifier(trainers=[{'algo': 'nag', 'min_improvement': 0.1}])
    X, y, _ = generate_classification_data()
    import numpy
    numpy.random.seed(43)
    check_classification_reproducibility(clf, X, y)
def test_xgboost_feature_importance():
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    clf = XGBoostClassifier(n_estimators=1, max_depth=1)
    clf.fit(X, y)
    importances = clf.get_feature_importances()
    original_features = set(X.columns)
    importances_features = set(importances.index)
    print(original_features, importances_features)
    assert original_features == importances_features, 'feature_importances_ returns something wrong'
    assert len(original_features) == len(clf.feature_importances_)
def test_pretrain():
    trainX, trainY, _ = generate_classification_data()
    trainers = [{'algo': 'pretrain', 'learning_rate': 0.5, 'patience': 1, 'validate_every': 1}]
    # only checking that fitting doesn't throw errors
    # this frequently gets stuck on CI
    TheanetsClassifier(layers=[5], trainers=trainers).fit(trainX, trainY)
def test_own_classification_reports():
    """
    testing clf.test_on
    """
    X, y, sample_weight = generate_classification_data()
    clf = SklearnClassifier(RandomForestClassifier())
    clf.fit(X, y, sample_weight=sample_weight)
    report = clf.test_on(X, y, sample_weight=sample_weight)
    roc1 = report.compute_metric(RocAuc())

    lds = LabeledDataStorage(X, y, sample_weight=sample_weight)
    roc2 = clf.test_on_lds(lds=lds).compute_metric(RocAuc())
    assert roc1 == roc2, 'Something wrong with test_on'
def test_gridsearch_threads(n_threads=3):
    scorer = FoldingScorer(numpy.random.choice([OptimalAMS(), RocAuc()]))
    grid_param = OrderedDict({"n_estimators": [10, 20],
                              "learning_rate": [0.1, 0.05],
                              'features': [['column0', 'column1'], ['column0', 'column1', 'column2']]})
    generator = RegressionParameterOptimizer(grid_param, n_evaluations=4)

    base = SklearnClassifier(clf=AdaBoostClassifier())
    grid = GridOptimalSearchCV(base, generator, scorer, parallel_profile='threads-{}'.format(n_threads))

    X, y, sample_weight = generate_classification_data()
    grid.fit(X, y, sample_weight=sample_weight)
def test_theanets_reproducibility():
    clf = TheanetsClassifier()
    X, y, sample_weight = generate_classification_data()

    clf.fit(X, y)
    auc = roc_auc_score(y, clf.predict_proba(X)[:, 1])
    for i in range(2):
        clf.fit(X, y)
        curr_auc = roc_auc_score(y, clf.predict_proba(X)[:, 1])
        assert auc == curr_auc, 'running a network twice produces different results'

    cloned_clf = clone(clf)
    cloned_clf.fit(X, y)
    cloned_auc = roc_auc_score(y, cloned_clf.predict_proba(X)[:, 1])
    assert cloned_auc == auc, 'cloned network produces different result'
def test_theanets_partial_fit():
    clf_complete = TheanetsClassifier(trainers=[{'optimize': 'rmsprop', 'patience': 0},
                                                {'optimize': 'rprop'}])
    clf_partial = TheanetsClassifier(trainers=[{'optimize': 'rmsprop', 'patience': 0}])
    X, y, sample_weight = generate_classification_data()
    clf_complete.fit(X, y)
    clf_partial.fit(X, y)
    clf_partial.partial_fit(X, y, optimize='rprop')

    assert clf_complete.trainers == clf_partial.trainers, 'trainers not saved in partial fit'

    auc_complete = roc_auc_score(y, clf_complete.predict_proba(X)[:, 1])
    auc_partial = roc_auc_score(y, clf_partial.predict_proba(X)[:, 1])
    assert auc_complete == auc_partial, 'same networks return different results'
def test_feature_importances():
    clf = XGBoostClassifier()
    X, y, sample_weight = generate_classification_data()
    clf.fit(X, y, sample_weight=sample_weight)
    # checking feature importance (three ways)
    res_default = clf.xgboost_classifier.get_fscore()
    res2 = clf._get_fscore()
    res3 = clf.feature_importances_
    assert res_default == res2, res_default
    for i, val in enumerate(res3):
        if val > 0.0:
            assert val == res_default['f' + str(i)]
def test_folding_regressor_functions():
    """
    Testing folding functions
    """
    data, y, sample_weight = generate_classification_data()

    for X in [data, numpy.array(data)]:
        kfolder = FoldingRegressor(SklearnRegressor(GradientBoostingRegressor(n_estimators=5)), n_folds=2)
        kfolder.fit(X, y, sample_weight=sample_weight)
        preds = kfolder.predict(X)
        for p in kfolder.staged_predict(X):
            pass
        assert numpy.allclose(p, preds)

        importances = kfolder.feature_importances_
        other_importances = kfolder.get_feature_importances()
def test_feature_splitter():
    # testing splitter
    from rep.metaml import FeatureSplitter
    X, y, sample_weight = generate_classification_data(n_classes=3)
    split_column = X.columns[0]
    splitters = numpy.random.randint(0, 3, size=len(X))
    X[split_column] = splitters
    X.ix[splitters == 1, :] += 4
    X.ix[splitters == 2, :] -= 4

    fs = FeatureSplitter(base_estimator=XGBoostClassifier(features=list(X.columns[1:]),
                                                          n_estimators=10, max_depth=3),
                         split_feature=split_column)
    fs.fit(X, y, sample_weight=sample_weight)
    assert fs.score(X, y) > 0.9
def test_grid():
    def generate_scorer(test, labels):
        def custom(base_estimator, params, X, y, sample_weight=None):
            cl = clone(base_estimator)
            cl.set_params(**params)
            if sample_weight is not None:
                cl.fit(X, y, sample_weight)
            else:
                cl.fit(X, y)
            return roc_auc_score(labels, cl.predict_proba(test)[:, 1])
        return custom

    X, y, _ = generate_classification_data()
    grid_custom(generate_scorer(X, y))
    run_grid(grid_sklearn)
    run_grid(grid_tmva)
def test_theanets_partial_fit():
    clf_complete = TheanetsClassifier(trainers=[{'optimize': 'rmsprop'}, {'optimize': 'rprop'}])
    clf_partial = TheanetsClassifier(trainers=[{'optimize': 'rmsprop'}])
    X, y, sample_weight = generate_classification_data()
    clf_complete.fit(X, y)
    clf_partial.fit(X, y)
    clf_partial.partial_fit(X, y, optimize='rprop')

    assert clf_complete.trainers == clf_partial.trainers, 'trainers not saved in partial fit'

    auc_complete = roc_auc_score(y, clf_complete.predict_proba(X)[:, 1])
    auc_partial = roc_auc_score(y, clf_partial.predict_proba(X)[:, 1])
    assert auc_complete == auc_partial, 'same networks return different results'
def test_feature_splitter():
    # testing splitter
    from rep.metaml import FeatureSplitter
    X, y, sample_weight = generate_classification_data(n_classes=3)
    split_column = X.columns[0]
    splitters = numpy.random.randint(0, 3, size=len(X))
    X[split_column] = splitters
    X.ix[splitters == 1, :] += 4
    X.ix[splitters == 2, :] -= 4

    fs = FeatureSplitter(base_estimator=XGBoostClassifier(features=list(X.columns[1:]),
                                                          n_estimators=10, max_depth=3),
                         split_feature=split_column)
    fs.fit(X, y, sample_weight=sample_weight)
    assert fs.score(X, y) > 0.9

    p_final = fs.predict_proba(X)
    for p in fs.staged_predict_proba(X):
        pass
    assert numpy.allclose(p_final, p), 'end of iterations differs from expected'
def test_feature_splitter():
    # testing splitter
    from rep.metaml import FeatureSplitter
    X, y, sample_weight = generate_classification_data(n_classes=3)
    split_column = X.columns[0]
    splitters = numpy.random.randint(0, 3, size=len(X))
    X[split_column] = splitters
    X.ix[splitters == 1, :] += 4
    X.ix[splitters == 2, :] -= 4

    fs = FeatureSplitter(base_estimator=XGBoostClassifier(n_estimators=10, max_depth=3),
                         split_feature=split_column,
                         train_features=list(X.columns[1:]))
    fs.fit(X, y, sample_weight=sample_weight)
    assert fs.score(X, y) > 0.9

    p_final = fs.predict_proba(X)
    for p in fs.staged_predict_proba(X):
        pass
    assert numpy.allclose(p_final, p), 'end of iterations differs from expected'
def test_grid_with_custom_scorer():
    """
    Introducing a special scorer which always uses all the data passed to gridsearch.fit
    for training and tests on another fixed dataset (which was passed to the scorer)
    by computing roc_auc_score from sklearn.
    """
    class CustomScorer(object):
        def __init__(self, testX, testY):
            self.testY = testY
            self.testX = testX

        def __call__(self, base_estimator, params, X, y, sample_weight=None):
            cl = clone(base_estimator)
            cl.set_params(**params)
            if sample_weight is not None:
                cl.fit(X, y, sample_weight)
            else:
                cl.fit(X, y)
            return roc_auc_score(self.testY, cl.predict_proba(self.testX)[:, 1])

    X, y, _ = generate_classification_data()
    custom_scorer = CustomScorer(X, y)

    grid_param = OrderedDict({"n_estimators": [10, 20],
                              "learning_rate": [0.1, 0.05],
                              'features': [['column0', 'column1'], ['column0', 'column1', 'column2']]})
    generator = SubgridParameterOptimizer(grid_param)

    base_estimator = SklearnClassifier(clf=AdaBoostClassifier())
    grid = GridOptimalSearchCV(base_estimator, generator, custom_scorer)

    cl = check_grid(grid, False, False, False)
    assert len(cl.features) <= 3
    params = cl.get_params()
    for key in grid_param:
        if key in params:
            assert params[key] == grid.generator.best_params_[key]
        else:
            assert params['clf__' + key] == grid.generator.best_params_[key]
def test_gridsearch_metrics_threads(n_threads=3):
    X, y, sample_weight = generate_classification_data(n_classes=2, distance=0.7)
    param_grid = OrderedDict({'reg_param': numpy.linspace(0, 1, 20)})

    from itertools import cycle
    optimizers = cycle([
        RegressionParameterOptimizer(param_grid=param_grid, n_evaluations=4, start_evaluations=2),
        SubgridParameterOptimizer(param_grid=param_grid, n_evaluations=4),
        RandomParameterOptimizer(param_grid=param_grid, n_evaluations=4),
    ])

    for metric in [RocAuc(), OptimalAMS(), OptimalSignificance(), log_loss]:
        scorer = FoldingScorer(metric)
        clf = SklearnClassifier(QuadraticDiscriminantAnalysis())
        grid = GridOptimalSearchCV(estimator=clf,
                                   params_generator=next(optimizers),
                                   scorer=scorer,
                                   parallel_profile='threads-{}'.format(n_threads))
        grid.fit(X, y)
        print(grid.params_generator.best_score_)
        print(grid.params_generator.best_params_)
        grid.params_generator.print_results()
def check_grid(estimator, check_instance=True, has_staged_pp=True, has_importances=True,
               use_weights=False, classification=True):
    if classification:
        X, y, sample_weight = generate_classification_data()
    else:
        X, y, sample_weight = generate_regression_data()
    assert len(sample_weight) == len(X), 'somehow lengths are different'

    if use_weights:
        assert estimator == estimator.fit(X, y, sample_weight=sample_weight)
        estimator = estimator.fit_best_estimator(X, y, sample_weight=sample_weight)
    else:
        assert estimator == estimator.fit(X, y)
        estimator = estimator.fit_best_estimator(X, y)

    if classification:
        check_classification_model(estimator, X, y, check_instance=check_instance,
                                   has_staged_pp=has_staged_pp, has_importances=has_importances)
    else:
        check_regression_model(estimator, X, y, check_instance=check_instance,
                               has_stages=has_staged_pp, has_importances=has_importances)
    return estimator
def test_theanets_reproducibility():
    clf = TheanetsClassifier(trainers=[{'min_improvement': 1}])
    X, y, _ = generate_classification_data()
    check_classification_reproducibility(clf, X, y)
def test_Exception_trained_status():
    X, _, _ = generate_classification_data()
    cl = MatrixNetClassifier(api_config_file=CONFIG_FILE_WRONG_URL, iterations=50)
    cl.training_status()
def test_mn_reproducibility():
    clf = MatrixNetClassifier(iterations=10)
    X, y, _ = generate_classification_data()
    check_classification_reproducibility(clf, X, y)
def test_Exception_synchronized():
    X, _, _ = generate_classification_data()
    cl = MatrixNetClassifier(api_config_file=CONFIG_FILE_WRONG_URL, iterations=50)
    cl.synchronize()
def test_neurolab_reproducibility():
    clf = NeurolabClassifier(layers=[4, 5], epochs=2, trainf=nl.train.train_gd)
    X, y, _ = generate_classification_data()
    check_classification_reproducibility(clf, X, y)
def pybrain_test_partial_fit():
    clf = PyBrainClassifier(layers=[4], epochs=2)
    X, y, _ = generate_classification_data()
    clf.partial_fit(X, y)
    clf.partial_fit(X[:2], y[:2])
def test_partial_fit():
    clf = NeurolabClassifier(layers=[4, 5], epochs=2, trainf=nl.train.train_gd)
    X, y, _ = generate_classification_data()
    clf.fit(X, y)
    clf.partial_fit(X[:2], y[:2])