from hyperopt import hp
from hpsklearn import (gaussian_nb, liblinear_svc, decision_tree, knn,
                       extra_trees, random_forest, gradient_boosting)


def tpe_classifier(name='clf'):
    # Only these (penalty, loss, dual) combinations are valid for LinearSVC,
    # so they are sampled together rather than independently.
    linear_svc_space = hp.choice('liblinear_combination',
                                 [{'penalty': "l1", 'loss': "squared_hinge", 'dual': False},
                                  {'penalty': "l2", 'loss': "hinge", 'dual': True},
                                  {'penalty': "l2", 'loss': "squared_hinge", 'dual': True},
                                  {'penalty': "l2", 'loss': "squared_hinge", 'dual': False}])
    return hp.choice(name, [
        gaussian_nb('hpsklearn_gaussian_nb'),
        liblinear_svc('hpsklearn_liblinear_svc',
                      C=hp.choice('hpsklearn_liblinear_svc_c',
                                  [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.]),
                      loss=linear_svc_space['loss'],
                      penalty=linear_svc_space['penalty'],
                      dual=linear_svc_space['dual'],
                      tol=hp.choice('hpsklearn_liblinear_svc_tol',
                                    [1e-5, 1e-4, 1e-3, 1e-2, 1e-1])),
        decision_tree('decision_tree',
                      criterion=hp.choice('decision_tree_criterion', ["gini", "entropy"]),
                      max_depth=hp.randint('decision_tree_max_depth', 10) + 1,
                      min_samples_split=hp.randint('decision_tree_min_samples_split', 19) + 2,
                      min_samples_leaf=hp.randint('decision_tree_min_samples_leaf', 20) + 1),
        knn('knn',
            n_neighbors=hp.randint('knn_n', 100) + 1,
            weights=hp.choice('knn_weights', ['uniform', 'distance']),
            p=hp.choice('knn_p', [1, 2])),
        extra_trees('et',
                    n_estimators=100,
                    criterion=hp.choice('et_criterion', ["gini", "entropy"]),
                    max_features=hp.randint('et_max_features', 20) * 0.05 + 0.05,
                    min_samples_split=hp.randint('et_min_samples_split', 19) + 2,
                    min_samples_leaf=hp.randint('et_min_samples_leaf', 20) + 1,
                    bootstrap=hp.choice('et_bootstrap', [True, False])),
        random_forest('rf',
                      n_estimators=100,
                      criterion=hp.choice('rf_criterion', ["gini", "entropy"]),
                      max_features=hp.randint('rf_max_features', 20) * 0.05 + 0.05,
                      min_samples_split=hp.randint('rf_min_samples_split', 19) + 2,
                      min_samples_leaf=hp.randint('rf_min_samples_leaf', 20) + 1,
                      bootstrap=hp.choice('rf_bootstrap', [True, False])),
        gradient_boosting('gb',
                          n_estimators=100,
                          learning_rate=hp.choice('gb_lr', [1e-3, 1e-2, 1e-1, 0.5, 1.]),
                          max_depth=hp.randint('gb_max_depth', 10) + 1,
                          min_samples_split=hp.randint('gb_min_samples_split', 19) + 2,
                          min_samples_leaf=hp.randint('gb_min_samples_leaf', 20) + 1,
                          subsample=hp.randint('gb_subsample', 20) * 0.05 + 0.05,
                          max_features=hp.randint('gb_max_features', 20) * 0.05 + 0.05),
    ])
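# A hedged usage sketch for the space above: HyperoptEstimator accepts a
# hyperopt search space as its classifier argument, so tpe_classifier() can
# drive the whole mixed-model space. The function name and the max_evals /
# trial_timeout values here are illustrative, not from the original.
def tpe_classifier_demo(X_train, y_train):
    from hpsklearn import HyperoptEstimator
    from hyperopt import tpe
    estim = HyperoptEstimator(classifier=tpe_classifier('clf'),
                              preprocessing=[],
                              algo=tpe.suggest,
                              max_evals=50,
                              trial_timeout=300)
    estim.fit(X_train, y_train)
    return estim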
def anySample2():
    from hpsklearn import HyperoptEstimator, extra_trees
    from sklearn.datasets import fetch_mldata
    from hyperopt import tpe
    import numpy as np

    # Download the data and split into training and test sets
    digits = fetch_mldata('MNIST original')
    X = digits.data
    y = digits.target
    test_size = int(0.2 * len(y))
    np.random.seed(13)
    indices = np.random.permutation(len(X))
    X_train = X[indices[:-test_size]]
    y_train = y[indices[:-test_size]]
    X_test = X[indices[-test_size:]]
    y_test = y[indices[-test_size:]]

    # Instantiate a HyperoptEstimator with the search space and number of evaluations
    estim = HyperoptEstimator(classifier=extra_trees('my_clf'),
                              preprocessing=[],
                              algo=tpe.suggest,
                              max_evals=10,
                              trial_timeout=300)

    # Search the hyperparameter space based on the data
    estim.fit(X_train, y_train)

    # Show the results
    print(estim.score(X_test, y_test))
    # 0.962785714286

    print(estim.best_model())
    # {'learner': ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='entropy',
    #            max_depth=None, max_features=0.959202875857,
    #            max_leaf_nodes=None, min_impurity_decrease=0.0,
    #            min_impurity_split=None, min_samples_leaf=1,
    #            min_samples_split=2, min_weight_fraction_leaf=0.0,
    #            n_estimators=20, n_jobs=1, oob_score=False, random_state=3,
    #            verbose=False, warm_start=False),
    #  'preprocs': (), 'ex_preprocs': ()}
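# Note: fetch_mldata was removed in scikit-learn 0.22. A minimal replacement
# sketch using fetch_openml -- 'mnist_784' is OpenML's name for the same
# dataset; OpenML returns string targets, hence the cast. The helper name
# load_mnist_openml is illustrative, not from the original.
def load_mnist_openml():
    from sklearn.datasets import fetch_openml
    import numpy as np
    mnist = fetch_openml('mnist_784', version=1, as_frame=False)
    X = mnist.data
    y = mnist.target.astype(np.int64)  # labels arrive as strings
    return X, y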
import numpy as np
from sklearn import svm, model_selection
from sklearn.datasets import load_iris
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier)
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from hpsklearn import random_forest, extra_trees, gradient_boosting
from pystacknet.pystacknet import StackNetClassifier


def main():
    # Download the data and split into training and test sets
    iris = load_iris()
    X = iris.data
    y = iris.target
    test_size = int(0.2 * len(y))
    np.random.seed(13)
    indices = np.random.permutation(len(X))
    X_train = X[indices[:-test_size]]
    y_train = y[indices[:-test_size]]
    X_test = X[indices[-test_size:]]
    y_test = y[indices[-test_size:]]
    # For other datasets there will be more complex data cleaning.

    # List all machine learning algorithms for hyperparameter tuning.
    # Each entry: [estimator, GridSearchCV param grid, hpsklearn search space].
    MLA = {
        'rfc': [
            RandomForestClassifier(),
            # RandomForestClassifier(n_estimators=100, criterion="entropy", max_depth=5,
            #                        max_features=0.5, random_state=1),
            {
                'n_estimators': [50, 100, 200],
                'criterion': ['entropy'],
                'max_depth': [4, 5, 6],
                # 'min_samples_split': [5, 10, .03, .05, .10],
                'max_features': [.5],
                'random_state': [1],
            },
            random_forest('my_rfc'),
        ],
        'etc': [
            ExtraTreesClassifier(),
            # ExtraTreesClassifier(n_estimators=100, criterion="entropy", max_depth=5,
            #                      max_features=0.5, random_state=1),
            {
                'n_estimators': [50, 100, 200],
                'criterion': ['entropy'],
                'max_depth': [4, 5, 6],
                'max_features': [.5],
                'random_state': [1],
            },
            extra_trees('my_etc'),
        ],
        'gbc': [
            GradientBoostingClassifier(),
            # GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5,
            #                            max_features=0.5, random_state=1),
            {
                # 'loss': ['deviance', 'exponential'],
                'learning_rate': [.1, .25, .5],
                'n_estimators': [50, 100, 200],
                # 'criterion': ['friedman_mse', 'mse', 'mae'],
                'max_depth': [4, 5, 6],
                'max_features': [.5],
                # 'min_samples_split': [5, 10, .03, .05, .10],
                # 'min_samples_leaf': [5, 10, .03, .05, .10],
                'random_state': [1],
            },
            gradient_boosting('my_rgc'),
        ],
        'lr': [
            LogisticRegression(),
            # LogisticRegression(random_state=1)
            {
                # 'fit_intercept': grid_bool,
                # 'penalty': ['l1', 'l2'],
                # 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                'random_state': [1],
            },
        ],
        'svc': [
            svm.SVC(),
            {
                # SVC - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
                # http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r
                # 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                'C': [1, 2, 3, 4, 5],               # default=1.0
                'gamma': [.1, .25, .5, .75, 1.0],   # default: auto
                'decision_function_shape': ['ovo', 'ovr'],  # default: ovr
                'probability': [True],
                'random_state': [0],
            },
        ],
        'xgb': [
            XGBClassifier(),
            {
                # XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html
                'learning_rate': [.01, .03, .05, .1, .25],  # default: .3
                'max_depth': [1, 2, 4, 6, 8, 10],           # default: 2
                'n_estimators': [10, 50, 100, 300],
                'seed': [0],
            },
        ],
    }

    # Some algorithms listed for HyperoptEstimator, but they raise errors !!!
    # MLA2 = {
    #     'rfc': [random_forest('my_rfc')],
    #     'etc': [extra_trees('my_etc')],
    #     'gbc': [gradient_boosting('my_rgc')],
    # }

    def opt(clf):
        est = MLA[clf][0]
        # --------- want to use Hyperopt here, but it raises errors !!!
        # estim = HyperoptEstimator(classifier=MLA2[clf][0],
        #                           preprocessing=[],
        #                           algo=tpe.suggest,
        #                           max_evals=3,
        #                           trial_timeout=120)
        # estim.fit(X_train, y_train)
        # est = estim
        # --------- want to use Hyperopt here, but it raises errors !!!

        # Use GridSearchCV instead; it is slow but works.
        est = model_selection.GridSearchCV(estimator=est,
                                           param_grid=MLA[clf][1],
                                           cv=5)  # , scoring='roc_auc'
        return est

    # Hand-tuned alternative for StackNetClassifier:
    # models = [
    #     ######## First level ########
    #     [RandomForestClassifier(n_estimators=100, criterion="entropy", max_depth=5,
    #                             max_features=0.5, random_state=1),
    #      ExtraTreesClassifier(n_estimators=100, criterion="entropy", max_depth=5,
    #                           max_features=0.5, random_state=1),
    #      GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5,
    #                                 max_features=0.5, random_state=1),
    #      LogisticRegression(random_state=1)],
    #     ######## Second level ########
    #     [RandomForestClassifier(n_estimators=200, criterion="entropy", max_depth=5,
    #                             max_features=0.5, random_state=1)]
    # ]
    models = [
        ######## First level ########
        [
            opt('rfc'),
            opt('etc'),
            # opt('gbc'),
            # opt('lr'),
        ],
        ######## Second level ########
        [
            opt('rfc'),
        ],
    ]

    # Use StackNet to stack the models
    StackNetmodel = StackNetClassifier(models,
                                       folds=4,
                                       # metric="auc",
                                       restacking=False,
                                       use_retraining=True,
                                       use_proba=True,
                                       random_state=12345,
                                       n_jobs=1,
                                       verbose=1)
    StackNetmodel.fit(X_train, y_train)
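
    # A short evaluation sketch, not in the original: pystacknet's
    # StackNetClassifier follows the sklearn predict/predict_proba convention
    # (per its README), and iris labels are 0..2, so argmax maps back to classes.
    probs = StackNetmodel.predict_proba(X_test)
    preds = probs.argmax(axis=1)
    print("held-out accuracy:", np.mean(preds == y_test))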