def test_theanets_single_classification():
    check_classifier(TheanetsClassifier(),
                     supports_weight=False, has_staged_pp=False, has_importances=False)
    check_classifier(TheanetsClassifier(layers=[]),
                     supports_weight=False, has_staged_pp=False, has_importances=False)
    check_classifier(TheanetsClassifier(layers=[20],
                                        trainers=[{'optimize': 'sgd', 'learning_rate': 0.3}]),
                     supports_weight=False, has_staged_pp=False, has_importances=False)
    check_classifier(TheanetsClassifier(layers=[5, 5],
                                        trainers=[{'optimize': 'sgd', 'learning_rate': 0.3}]),
                     supports_weight=False, has_staged_pp=False, has_importances=False)
def test_theanets_configurations():
    check_classifier(
        TheanetsClassifier(layers=[13], scaler=False,
                           trainers=[{'algo': 'nag', 'learning_rate': 0.1}]),
        **classifier_params)
    check_classifier(
        TheanetsClassifier(layers=[5, 5], scaler='minmax',
                           trainers=[{'algo': 'adadelta', 'learning_rate': 0.1}]),
        **classifier_params)
def test_theanets_single_classification():
    check_classifier(TheanetsClassifier(trainers=[{'patience': 0}]), **classifier_params)
    check_classifier(
        TheanetsClassifier(layers=[], scaler='minmax', trainers=[{'patience': 0}]),
        **classifier_params)
def test_theanets_configurations():
    # `impatient` is a trainer-kwargs dict defined elsewhere in this module
    # (presumably low-patience early-stopping settings)
    check_classifier(
        TheanetsClassifier(layers=[13], scaler=False,
                           trainers=[dict(algo='nag', learning_rate=0.1, **impatient)]),
        **classifier_params)
    check_classifier(
        TheanetsClassifier(layers=[5, 5],
                           trainers=[dict(algo='adam', learning_rate=0.01, momentum=0.9)]),
        **classifier_params)
def test_theanets_partial_fit():
    clf_complete = TheanetsClassifier(trainers=[{'optimize': 'rmsprop', 'patience': 0},
                                                {'optimize': 'rprop'}])
    clf_partial = TheanetsClassifier(trainers=[{'optimize': 'rmsprop', 'patience': 0}])
    X, y, sample_weight = generate_classification_data()
    clf_complete.fit(X, y)
    clf_partial.fit(X, y)
    clf_partial.partial_fit(X, y, optimize='rprop')

    assert clf_complete.trainers == clf_partial.trainers, 'trainers not saved in partial fit'

    auc_complete = roc_auc_score(y, clf_complete.predict_proba(X)[:, 1])
    auc_partial = roc_auc_score(y, clf_partial.predict_proba(X)[:, 1])
    assert auc_complete == auc_partial, 'same networks return different results'
def test_theanets_configurations():
    check_classifier(
        TheanetsClassifier(layers=[20], scaler=False,
                           trainers=[{'optimize': 'nag', 'learning_rate': 0.3,
                                      'min_improvement': 0.5}]),
        **classifier_params)
    check_classifier(
        TheanetsClassifier(layers=[5, 5],
                           trainers=[{'optimize': 'nag', 'learning_rate': 0.3,
                                      'min_improvement': 0.5}]),
        **classifier_params)
def test_theanets_multiclassification():
    check_classifier(
        TheanetsClassifier(trainers=[{'min_improvement': 0.1, 'learning_rate': 0.1}]),
        n_classes=4, **classifier_params)
def test_theanets_partial_fit():
    clf_complete = TheanetsClassifier(layers=[2],
                                      trainers=[{'algo': 'rmsprop', 'learning_rate': 0.1},
                                                {'algo': 'rprop', 'learning_rate': 0.1}])
    clf_partial = TheanetsClassifier(layers=[2],
                                     trainers=[{'algo': 'rmsprop', 'learning_rate': 0.1}])
    X, y, sample_weight = generate_classification_data()
    clf_complete.fit(X, y)
    clf_partial.fit(X, y)
    clf_partial.partial_fit(X, y, algo='rprop', learning_rate=0.1)

    assert clf_complete.trainers == clf_partial.trainers, 'trainers not saved in partial fit'

    auc_complete = roc_auc_score(y, clf_complete.predict_proba(X)[:, 1])
    auc_partial = roc_auc_score(y, clf_partial.predict_proba(X)[:, 1])
    # Known fail of theanets
    assert auc_complete == auc_partial, 'same networks return different results'
def test_theanets_simple_stacking():
    base_tnt = TheanetsClassifier()
    check_classifier(
        SklearnClassifier(clf=BaggingClassifier(base_estimator=base_tnt, n_estimators=3)),
        supports_weight=False, has_staged_pp=False, has_importances=False)
def test_theanets_multiple_classification():
    check_classifier(
        TheanetsClassifier(trainers=[{'optimize': 'adadelta', 'min_improvement': 0.5},
                                     {'optimize': 'nag'}]),
        **classifier_params)
def test_theanets_reproducibility():
    clf = TheanetsClassifier(trainers=[{'algo': 'nag', 'min_improvement': 0.1,
                                        'max_updates': 10}])
    X, y, _ = generate_classification_data()
    check_classification_reproducibility(clf, X, y)
def test_theanets_reproducibility():
    clf = TheanetsClassifier(trainers=[{'algo': 'nag', 'min_improvement': 0.1}])
    X, y, _ = generate_classification_data()

    import numpy
    # fix the global numpy seed so the reproducibility check starts from a known state
    numpy.random.seed(43)
    check_classification_reproducibility(clf, X, y)
def test_theanets_multiple_classification():
    check_classifier(TheanetsClassifier(trainers=[{'optimize': 'adadelta'},
                                                  {'optimize': 'nag'}]),
                     supports_weight=False, has_staged_pp=False, has_importances=False)
def test_pretrain():
    clf = TheanetsClassifier(layers=[5, 5],
                             trainers=[{'algo': 'pretrain', 'learning_rate': 0.1},
                                       {'algo': 'nag', 'learning_rate': 0.1}])
    check_classifier(clf, **classifier_params)
def test_pretrain():
    clf = TheanetsClassifier(trainers=[{'optimize': 'pretrain', 'patience': 1,
                                        'learning_rate': 0.1},
                                       {'optimize': 'nag', 'patience': 1}])
    check_classifier(clf, **classifier_params)
def test_pretrain():
    trainX, trainY, _ = generate_classification_data()
    trainers = [{'algo': 'pretrain', 'learning_rate': 0.5, 'patience': 1,
                 'validate_every': 1}]
    # only checking that fitting doesn't throw errors;
    # this frequently gets stuck on CI
    TheanetsClassifier(layers=[5], trainers=trainers).fit(trainX, trainY)
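# A minimal standalone sketch of the two-stage pattern exercised by the
# test_pretrain variants above: a 'pretrain' trainer does greedy layer-wise
# pre-training, after which a supervised trainer ('nag' here) refines the whole
# network. Assumes the rep TheanetsClassifier API and that the
# generate_classification_data helper is importable from rep.test.test_estimators;
# the layer size and learning rates are illustrative only.
def example_pretrain_then_finetune():
    from rep.estimators.theanets import TheanetsClassifier
    from rep.test.test_estimators import generate_classification_data

    X, y, _ = generate_classification_data()
    clf = TheanetsClassifier(
        layers=[5],
        trainers=[{'algo': 'pretrain', 'learning_rate': 0.1},  # unsupervised warm-up
                  {'algo': 'nag', 'learning_rate': 0.1}])      # supervised fine-tuning
    clf.fit(X, y)  # the trainers run in order during a single fit
    return clf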
def test_theanets_reproducibility():
    clf = TheanetsClassifier()
    X, y, sample_weight = generate_classification_data()
    clf.fit(X, y)
    auc = roc_auc_score(y, clf.predict_proba(X)[:, 1])
    for i in range(2):
        clf.fit(X, y)
        curr_auc = roc_auc_score(y, clf.predict_proba(X)[:, 1])
        assert auc == curr_auc, 'running a network twice produces different results'

    cloned_clf = clone(clf)
    cloned_clf.fit(X, y)
    cloned_auc = roc_auc_score(y, cloned_clf.predict_proba(X)[:, 1])
    assert cloned_auc == auc, 'cloned network produces different result'
def _make_clf(self, clf, bagging=None):
    """Creates a classifier from a dict or returns the clf"""
    key = None  # stays None when a ready-made estimator is passed in
    if isinstance(clf, dict):
        key, val = clf.popitem()
        try:
            # indexing (unlike dict.get) raises KeyError for unknown keys
            val = self.__DEFAULT_CLF_CFG[key] if val is None else val
        except KeyError:
            logger.error(str(key) + " not an implemented classifier.")
            raise
        temp_bagging = val.pop('bagging', bagging)
        bagging = temp_bagging if bagging is None else bagging

        if key == 'rdf':
            config_clf = dict(val)  # possible multi-threading arguments (same below)
            clf = SklearnClassifier(RandomForestClassifier(**config_clf))
        elif key == 'erf':
            config_clf = dict(val)
            clf = SklearnClassifier(ExtraTreesClassifier(**config_clf))
        elif key == 'nn':
            config_clf = dict(val)
            clf = TheanetsClassifier(**config_clf)
        elif key == 'ada':
            config_clf = dict(val)
            clf = SklearnClassifier(AdaBoostClassifier(**config_clf))
        elif key == 'gb':
            config_clf = dict(val)
            clf = SklearnClassifier(GradientBoostingClassifier(**config_clf))
        elif key == 'xgb':
            config_clf = dict(val)
            clf = XGBoostClassifier(**config_clf)
    elif hasattr(clf, 'fit'):
        bagging = False  # return the classifier unchanged
    else:
        raise ValueError(str(clf) + " not valid as a classifier.")

    # bagging over the instantiated estimators
    if isinstance(bagging, int) and bagging >= 1:
        bagging = dict(self.__DEFAULT_BAG_CFG, n_estimators=bagging)
    if isinstance(bagging, dict):
        # TODO: implement multi-threading
        bagging.update({'base_estimator': clf})
        clf = SklearnClassifier(BaggingClassifier(**bagging))

    clf = {key: clf}
    return clf
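# Hypothetical call sites for _make_clf (the `builder` instance and the config
# values below are illustrative, not taken from the source):
#
#   builder._make_clf({'xgb': dict(n_estimators=100)})             # explicit config
#   builder._make_clf({'nn': None})                                # falls back to __DEFAULT_CLF_CFG['nn']
#   builder._make_clf({'rdf': dict(n_estimators=300)}, bagging=5)  # wrap in a 5-estimator bagger
#
# Each call returns a one-entry dict {key: wrapped_classifier}.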
def clf_mayou(data1, data2, n_folds=3, n_base_clf=5):
    """DEVELOPMENT, WIP. Test a setup of clf involving bagging and stacking."""
    # import raredecay.analysis.ml_analysis as ml_ana
    # import pandas as pd
    import copy

    from rep.estimators import SklearnClassifier, XGBoostClassifier
    from rep.metaml.folding import FoldingClassifier
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.ensemble import BaggingClassifier  # , VotingClassifier, AdaBoostClassifier
    from rep.estimators.theanets import TheanetsClassifier
    from sklearn.linear_model import LogisticRegression
    from rep.metaml.cache import CacheClassifier
    from rep.report.metrics import RocAuc

    import rep.metaml.cache
    from rep.metaml._cache import CacheHelper
    rep.metaml.cache.cache_helper = CacheHelper('/home/mayou/cache', 100000)

    # data1.make_folds(n_folds)
    # data2.make_folds(n_folds)
    output = {}

    # for i in range(n_folds):
    xgb_clf = XGBoostClassifier(n_estimators=350, eta=0.1, max_depth=4, nthreads=3)
    xgb_folded = FoldingClassifier(base_estimator=xgb_clf, stratified=True,
                                   parallel_profile='threads-2')
    xgb_bagged = BaggingClassifier(base_estimator=xgb_folded,
                                   n_estimators=n_base_clf, bootstrap=False)
    xgb_bagged = SklearnClassifier(xgb_bagged)
    xgb_big_stacker = copy.deepcopy(xgb_bagged)
    xgb_bagged = CacheClassifier(name='xgb_bagged1', clf=xgb_bagged)

    xgb_single = XGBoostClassifier(n_estimators=350, eta=0.1, max_depth=4, nthreads=3)
    xgb_single = FoldingClassifier(base_estimator=xgb_single, stratified=True,
                                   n_folds=10, parallel_profile='threads-2')
    xgb_single = CacheClassifier(name='xgb_singled1', clf=xgb_single)

    rdf_clf = SklearnClassifier(RandomForestClassifier(n_estimators=300, n_jobs=3))
    rdf_folded = FoldingClassifier(base_estimator=rdf_clf, stratified=True,
                                   parallel_profile='threads-2')
    rdf_bagged = BaggingClassifier(base_estimator=rdf_folded,
                                   n_estimators=n_base_clf, bootstrap=False)
    rdf_bagged = SklearnClassifier(rdf_bagged)
    rdf_bagged = CacheClassifier(name='rdf_bagged1', clf=rdf_bagged)

    gb_clf = SklearnClassifier(GradientBoostingClassifier(n_estimators=50))
    gb_folded = FoldingClassifier(base_estimator=gb_clf, stratified=True,
                                  parallel_profile='threads-6')
    gb_bagged = BaggingClassifier(base_estimator=gb_folded,
                                  n_estimators=n_base_clf, bootstrap=False, n_jobs=5)
    gb_bagged = SklearnClassifier(gb_bagged)
    gb_bagged = CacheClassifier(name='gb_bagged1', clf=gb_bagged)

    nn_clf = TheanetsClassifier(layers=[300, 300], hidden_dropout=0.03,
                                trainers=[{'optimize': 'adagrad', 'patience': 5,
                                           'learning_rate': 0.2, 'min_improvement': 0.1,
                                           'momentum': 0.4, 'nesterov': True,
                                           'loss': 'xe'}])
    nn_folded = FoldingClassifier(base_estimator=nn_clf, stratified=True,
                                  parallel_profile=None)  # 'threads-6'
    nn_bagged = BaggingClassifier(base_estimator=nn_folded,
                                  n_estimators=n_base_clf, bootstrap=False, n_jobs=1)
    nn_bagged = CacheClassifier(name='nn_bagged1', clf=nn_bagged)

    nn_single_clf = TheanetsClassifier(layers=[300, 300, 300], hidden_dropout=0.03,
                                       trainers=[{'optimize': 'adagrad', 'patience': 5,
                                                  'learning_rate': 0.2,
                                                  'min_improvement': 0.1,
                                                  'momentum': 0.4, 'nesterov': True,
                                                  'loss': 'xe'}])
    nn_single = FoldingClassifier(base_estimator=nn_single_clf, n_folds=3, stratified=True)
    nn_single = CacheClassifier(name='nn_single1', clf=nn_single)

    logit_stacker = SklearnClassifier(LogisticRegression(penalty='l2', solver='sag'))
    logit_stacker = FoldingClassifier(base_estimator=logit_stacker, n_folds=n_folds,
                                      stratified=True, parallel_profile='threads-6')
    logit_stacker = CacheClassifier(name='logit_stacker1', clf=logit_stacker)

    xgb_stacker = XGBoostClassifier(n_estimators=400, eta=0.1, max_depth=4, nthreads=8)
    # HACK
    xgb_stacker = xgb_big_stacker
    xgb_stacker = FoldingClassifier(base_estimator=xgb_stacker, n_folds=n_folds,
                                    random_state=42, stratified=True,
                                    parallel_profile='threads-6')
    xgb_stacker = CacheClassifier(name='xgb_stacker1', clf=xgb_stacker)

    # train1, test1 = data1.get_fold(i)
    # train2, test2 = data1.get_fold(i)
    #
    # t_data, t_targets, t_weights =
    data, targets, weights = data1.make_dataset(data2, weights_ratio=1)

    # xgb_bagged.fit(data, targets, weights)
    # xgb_report = xgb_bagged.test_on(data, targets, weights)
    # xgb_report.roc(physics_notion=True).plot(new_plot=True,
    #                                          title="ROC AUC xgb_base classifier")
    # output['xgb_base'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    # xgb_proba = xgb_report.prediction['clf'][:, 1]
    # del xgb_bagged, xgb_folded, xgb_clf, xgb_report
    #
    # xgb_single.fit(data, targets, weights)
    # xgb_report = xgb_single.test_on(data, targets, weights)
    # xgb_report.roc(physics_notion=True).plot(new_plot=True,
    #                                          title="ROC AUC xgb_single classifier")
    # output['xgb_single'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    # xgb_proba = xgb_report.prediction['clf'][:, 1]
    # del xgb_single, xgb_report

    nn_single.fit(data, targets, weights)
    nn_report = nn_single.test_on(data, targets, weights)
    nn_report.roc(physics_notion=True).plot(new_plot=True,
                                            title="ROC AUC nn_single classifier")
    output['nn_single'] = "roc auc:" + str(nn_report.compute_metric(metric=RocAuc()))
    # nn_proba = nn_report.prediction['clf'][:, 1]
    del nn_single, nn_report

    # rdf_bagged.fit(data, targets, weights)
    # rdf_report = rdf_bagged.test_on(data, targets, weights)
    # rdf_report.roc(physics_notion=True).plot(new_plot=True,
    #                                          title="ROC AUC rdf_base classifier")
    # output['rdf_base'] = "roc auc:" + str(rdf_report.compute_metric(metric=RocAuc()))
    # rdf_proba = rdf_report.prediction['clf'][:, 1]
    # del rdf_bagged, rdf_clf, rdf_folded, rdf_report

    # gb_bagged.fit(data, targets, weights)
    # gb_report = gb_bagged.test_on(data, targets, weights)
    # gb_report.roc(physics_notion=True).plot(new_plot=True,
    #                                         title="ROC AUC gb_base classifier")
    # output['gb_base'] = "roc auc:" + str(gb_report.compute_metric(metric=RocAuc()))
    # gb_proba = gb_report.prediction['clf'][:, 1]
    # del gb_bagged, gb_clf, gb_folded, gb_report

    # nn_bagged.fit(data, targets, weights)
    # nn_report = nn_bagged.test_on(data, targets, weights)
    # nn_report.roc(physics_notion=True).plot(new_plot=True,
    #                                         title="ROC AUC nn_base classifier")
    # output['nn_base'] = "roc auc:" + str(nn_report.compute_metric(metric=RocAuc()))
    # nn_proba = nn_report.prediction['clf'][:, 1]
    # del nn_bagged, nn_clf, nn_folded, nn_report
    #
    # base_predict = pd.DataFrame({'xgb': xgb_proba,
    #                              # 'rdf': rdf_proba,
    #                              # 'gb': gb_proba,
    #                              'nn': nn_proba
    #                              })
    #
    # xgb_stacker.fit(base_predict, targets, weights)
    # xgb_report = xgb_stacker.test_on(base_predict, targets, weights)
    # xgb_report.roc(physics_notion=True).plot(new_plot=True,
    #                                          title="ROC AUC xgb_stacked classifier")
    # output['stacker_xgb'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    # del xgb_stacker, xgb_report
    #
    # logit_stacker.fit(base_predict, targets, weights)
    # logit_report = logit_stacker.test_on(base_predict, targets, weights)
    # logit_report.roc(physics_notion=True).plot(new_plot=True,
    #                                            title="ROC AUC logit_stacked classifier")
    # output['stacker_logit'] = "roc auc:" + str(logit_report.compute_metric(metric=RocAuc()))
    # del logit_stacker, logit_report

    print(output)
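# The commented-out stacking step above builds a feature matrix from the base
# classifiers' predicted probabilities and trains a second-level model on it.
# A minimal sketch of that pattern with plain sklearn (function and variable
# names are illustrative, not from the source):
def example_probability_stacking(X, y):
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_predict

    # out-of-fold probabilities from two base models, analogous to the
    # FoldingClassifier predictions collected in clf_mayou
    rdf_proba = cross_val_predict(RandomForestClassifier(n_estimators=50), X, y,
                                  cv=3, method='predict_proba')[:, 1]
    gb_proba = cross_val_predict(GradientBoostingClassifier(n_estimators=50), X, y,
                                 cv=3, method='predict_proba')[:, 1]
    base_predict = pd.DataFrame({'rdf': rdf_proba, 'gb': gb_proba})

    # second-level (stacker) model trained on the base predictions
    stacker = LogisticRegression().fit(base_predict, y)
    return stacker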
def test_theanets_reproducibility():
    clf = TheanetsClassifier(trainers=[{'min_improvement': 1}])
    X, y, _ = generate_classification_data()
    check_classification_reproducibility(clf, X, y)
def test_theanets_multiclassification():
    check_classifier(TheanetsClassifier(trainers=[{'patience': 0}]),
                     n_classes=4, **classifier_params)
def test_theanets_partial_fit():
    clf_complete = TheanetsClassifier(trainers=[{'optimize': 'rmsprop'},
                                                {'optimize': 'rprop'}])
    clf_partial = TheanetsClassifier(trainers=[{'optimize': 'rmsprop'}])
    X, y, sample_weight = generate_classification_data()
    clf_complete.fit(X, y)
    clf_partial.fit(X, y)
    clf_partial.partial_fit(X, y, optimize='rprop')

    assert clf_complete.trainers == clf_partial.trainers, 'trainers not saved in partial fit'

    auc_complete = roc_auc_score(y, clf_complete.predict_proba(X)[:, 1])
    auc_partial = roc_auc_score(y, clf_partial.predict_proba(X)[:, 1])
    assert auc_complete == auc_partial, 'same networks return different results'
def test_theanets_simple_stacking():
    base_tnt = TheanetsClassifier(trainers=[{'min_improvement': 0.1}])
    base_bagging = BaggingClassifier(base_estimator=base_tnt, n_estimators=3)
    check_classifier(SklearnClassifier(clf=base_bagging), **classifier_params)
def test_theanets_partial_fit():
    clf_complete = TheanetsClassifier(layers=[2],
                                      trainers=[{'algo': 'rmsprop', 'learning_rate': 0.1},
                                                {'algo': 'rprop', 'learning_rate': 0.1}])
    clf_partial = TheanetsClassifier(layers=[2],
                                     trainers=[{'algo': 'rmsprop', 'learning_rate': 0.1}])
    X, y, sample_weight = generate_classification_data()
    clf_complete.fit(X, y)
    clf_partial.fit(X, y)
    clf_partial.partial_fit(X, y, algo='rprop', learning_rate=0.1)

    assert clf_complete.trainers == clf_partial.trainers, 'trainers not saved in partial fit'

    auc_complete = roc_auc_score(y, clf_complete.predict_proba(X)[:, 1])
    auc_partial = roc_auc_score(y, clf_partial.predict_proba(X)[:, 1])
    # Known fail of theanets
    assert auc_complete == auc_partial, 'same networks return different results'
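# Standalone sketch of the continuation pattern the partial-fit tests above
# rely on: keyword arguments passed to partial_fit are stored as one more
# trainer, so fit with trainer A followed by partial_fit with B's kwargs is
# expected to match a single fit with trainers [A, B]. Assumes the same rep
# API and that the test helpers are importable from rep.test.test_estimators.
def example_partial_fit_continuation():
    from rep.estimators.theanets import TheanetsClassifier
    from rep.test.test_estimators import generate_classification_data
    from sklearn.metrics import roc_auc_score

    X, y, _ = generate_classification_data()
    clf = TheanetsClassifier(layers=[2],
                             trainers=[{'algo': 'rmsprop', 'learning_rate': 0.1}])
    clf.fit(X, y)                                           # first training stage
    clf.partial_fit(X, y, algo='rprop', learning_rate=0.1)  # appended as trainers[1]
    return roc_auc_score(y, clf.predict_proba(X)[:, 1])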