def random_forest_model_tpe():
    estim = HyperoptEstimator(classifier=random_forest('my_clf'),
                              preprocessing=[pca('my_pca')],
                              algo=tpe.suggest,
                              max_evals=150,
                              trial_timeout=60,
                              verbose=0)
    estim.fit(x_train, y_train)
    print("f1 score", f1_score(estim.predict(x_test), y_test))
    print("accuracy score", accuracy_score(estim.predict(x_test), y_test))
    print(estim.best_model())
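# A minimal sketch (not part of the original) of how the globals assumed by
# random_forest_model_tpe() could be provided; the breast-cancer dataset is an
# assumed stand-in, chosen because the default binary f1_score needs a
# two-class target.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
random_forest_model_tpe()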
def bench_classifiers(name):
    classifiers = [
        ada_boost(name + '.ada_boost'),  # boo
        gaussian_nb(name + '.gaussian_nb'),  # eey
        knn(name + '.knn', sparse_data=True),  # eey
        linear_discriminant_analysis(name + '.linear_discriminant_analysis', n_components=1),  # eey
        random_forest(name + '.random_forest'),  # boo
        sgd(name + '.sgd')  # eey
    ]
    if xgboost:
        classifiers.append(xgboost_classification(name + '.xgboost'))  # boo
    return hp.choice('%s' % name, classifiers)
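# Hedged usage sketch (assumed, not from the original): the hp.choice space
# returned by bench_classifiers() can be passed straight to HyperoptEstimator
# as its classifier search space. X_train/y_train are assumed to exist.
estim = HyperoptEstimator(classifier=bench_classifiers('bench'),
                          algo=tpe.suggest,
                          max_evals=50,
                          trial_timeout=60)
estim.fit(X_train, y_train)
print(estim.best_model())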
def tpe_classifier(name='clf'):
    linear_svc_space = hp.choice('liblinear_combination',
                                 [{'penalty': "l1", 'loss': "squared_hinge", 'dual': False},
                                  {'penalty': "l2", 'loss': "hinge", 'dual': True},
                                  {'penalty': "l2", 'loss': "squared_hinge", 'dual': True},
                                  {'penalty': "l2", 'loss': "squared_hinge", 'dual': False}])
    return hp.choice(name, [
        gaussian_nb('hpsklearn_gaussian_nb'),
        liblinear_svc('hpsklearn_liblinear_svc',
                      C=hp.choice('hpsklearn_liblinear_svc_c',
                                  [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.]),
                      loss=linear_svc_space['loss'],
                      penalty=linear_svc_space['penalty'],
                      dual=linear_svc_space['dual'],
                      tol=hp.choice('hpsklearn_liblinear_svc_tol',
                                    [1e-5, 1e-4, 1e-3, 1e-2, 1e-1])),
        decision_tree('decision_tree',
                      criterion=hp.choice('decision_tree_criterion', ["gini", "entropy"]),
                      max_depth=hp.randint('decision_tree_max_depth', 10) + 1,
                      min_samples_split=hp.randint('decision_tree_min_samples_split', 19) + 2,
                      min_samples_leaf=hp.randint('decision_tree_min_samples_leaf', 20) + 1),
        knn('knn',
            n_neighbors=hp.randint('knn_n', 100) + 1,
            weights=hp.choice('knn_weights', ['uniform', 'distance']),
            p=hp.choice('knn_p', [1, 2])),
        extra_trees('et',
                    n_estimators=100,
                    criterion=hp.choice('et_criterion', ["gini", "entropy"]),
                    max_features=hp.randint('et_max_features', 20) * 0.05 + 0.05,
                    min_samples_split=hp.randint('et_min_samples_split', 19) + 2,
                    min_samples_leaf=hp.randint('et_min_samples_leaf', 20) + 1,
                    bootstrap=hp.choice('et_bootstrap', [True, False])),
        random_forest('rf',
                      n_estimators=100,
                      criterion=hp.choice('rf_criterion', ["gini", "entropy"]),
                      max_features=hp.randint('rf_max_features', 20) * 0.05 + 0.05,
                      min_samples_split=hp.randint('rf_min_samples_split', 19) + 2,
                      min_samples_leaf=hp.randint('rf_min_samples_leaf', 20) + 1,
                      bootstrap=hp.choice('rf_bootstrap', [True, False])),
        gradient_boosting('gb',
                          n_estimators=100,
                          learning_rate=hp.choice('gb_lr', [1e-3, 1e-2, 1e-1, 0.5, 1.]),
                          max_depth=hp.randint('gb_max_depth', 10) + 1,
                          min_samples_split=hp.randint('gb_min_samples_split', 19) + 2,
                          min_samples_leaf=hp.randint('gb_min_samples_leaf', 20) + 1,
                          subsample=hp.randint('gb_subsample', 20) * 0.05 + 0.05,
                          max_features=hp.randint('gb_max_features', 20) * 0.05 + 0.05)
    ])
def main():
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')
    X_train, y_train = df_train.iloc[:, 2:].values, df_train.iloc[:, 0].values
    X_test, y_test = df_test.iloc[:, 2:].values, df_test.iloc[:, 0].values
    estim = HyperoptEstimator(classifier=random_forest('myDT'),
                              algo=tpe.suggest,
                              max_evals=150,
                              trial_timeout=120,
                              verbose=True)
    estim.fit(X_train, y_train)
    print("\n\n{}\n\n".format(estim.score(X_test, y_test)))
    print("\n\n{}\n\n".format(estim.best_model()))
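# Assumed script entry point (not in the original snippet) so main() actually
# runs when the file is executed directly.
if __name__ == '__main__':
    main()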
print("\ndata is loaded - next step > model testing\n") n_job = 6 select_classes = [0, 1, 2, 3, 4, 5] val_dist = X_val_mini.shape[0] / X_train_mini.shape[0] name = 'my_est_oVa' tic_mod_all = time.time() select_alg = [ ada_boost(name + '.ada_boost'), gaussian_nb(name + '.gaussian_nb'), knn(name + '.knn', sparse_data=True), linear_discriminant_analysis(name + '.linear_discriminant_analysis', n_components=1), random_forest(name + '.random_forest'), sgd(name + '.sgd'), xgboost_classification(name + '.xgboost') ] # fitting models estim_one_vs_rest = dict() # scoring models algo_scoring = dict() save_score_path = r'C:/Users/anden/PycharmProjects/NovelEEG/results' for alg in [select_alg[args.index]]: tic_mod = time.time() print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", "running on %s" % (alg.name + '.one_V_all'), "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") clf_method = one_vs_rest(str(alg.name + '.one_V_all'),
def main():
    # Download the data and split into training and test sets
    iris = load_iris()
    X = iris.data
    y = iris.target
    test_size = int(0.2 * len(y))
    np.random.seed(13)
    indices = np.random.permutation(len(X))
    X_train = X[indices[:-test_size]]
    y_train = y[indices[:-test_size]]
    X_test = X[indices[-test_size:]]
    y_test = y[indices[-test_size:]]
    # for other datasets there will be more complex data cleaning

    # list all machine learning algorithms for hyper-parameter tuning
    MLA = {
        'rfc': [
            RandomForestClassifier(),
            # RandomForestClassifier(n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
            {
                'n_estimators': [50, 100, 200],
                'criterion': ['entropy'],
                'max_depth': [4, 5, 6],
                # 'min_samples_split': [5, 10, .03, .05, .10],
                'max_features': [.5],
                'random_state': [1],
            },
            random_forest('my_rfc'),
        ],
        'etc': [
            ExtraTreesClassifier(),
            # ExtraTreesClassifier(n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
            {
                'n_estimators': [50, 100, 200],
                'criterion': ['entropy'],
                'max_depth': [4, 5, 6],
                'max_features': [.5],
                'random_state': [1],
            },
            extra_trees('my_etc'),
        ],
        'gbc': [
            GradientBoostingClassifier(),
            # GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1),
            {
                # 'loss': ['deviance', 'exponential'],
                'learning_rate': [.1, .25, .5],
                'n_estimators': [50, 100, 200],
                # 'criterion': ['friedman_mse', 'mse', 'mae'],
                'max_depth': [4, 5, 6],
                'max_features': [.5],
                # 'min_samples_split': [5, 10, .03, .05, .10],
                # 'min_samples_leaf': [5, 10, .03, .05, .10],
                'random_state': [1],
            },
            gradient_boosting('my_rgc'),
        ],
        'lr': [
            LogisticRegression(),
            # LogisticRegression(random_state=1)
            {
                # 'fit_intercept': grid_bool,
                # 'penalty': ['l1', 'l2'],
                # 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                'random_state': [1],
            },
        ],
        'svc': [
            svm.SVC(),
            {
                # SVC - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
                # http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r
                # 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                'C': [1, 2, 3, 4, 5],  # default=1.0
                'gamma': [.1, .25, .5, .75, 1.0],  # default: auto
                'decision_function_shape': ['ovo', 'ovr'],  # default: ovr
                'probability': [True],
                'random_state': [0]
            },
        ],
        'xgb': [
            XGBClassifier(),
            {
                # XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html
                'learning_rate': [.01, .03, .05, .1, .25],  # default: .3
                'max_depth': [1, 2, 4, 6, 8, 10],
                'n_estimators': [10, 50, 100, 300],
                'seed': [0]
            },
        ]
    }

    # list some algorithms for HyperoptEstimator, but error !!!
    # MLA2 = {
    #     'rfc': [
    #         random_forest('my_rfc'),
    #     ],
    #     'etc': [
    #         extra_trees('my_etc'),
    #     ],
    #     'gbc': [
    #         gradient_boosting('my_rgc'),
    #     ],
    # }
    # list some algorithms for HyperoptEstimator, but error !!!

    def opt(clf):
        est = MLA[clf][0]
        # ---------want to use Hyperopt, but has some errors !!!
        # estim = HyperoptEstimator(classifier=MLA2[clf][0],
        #                           preprocessing=[],
        #                           algo=tpe.suggest,
        #                           max_evals=3,
        #                           trial_timeout=120)
        # estim.fit(X_train, y_train)
        # est = estim
        # ---------want to use Hyperopt, but has some errors !!!
        # use GridSearchCV, it's too slow
        est = model_selection.GridSearchCV(estimator=est,
                                           param_grid=MLA[clf][1],
                                           cv=5)  # --, scoring='roc_auc'
        return est

    # for StackNetClassifier
    # models = [
    #     ######### First level ########
    #     [RandomForestClassifier(n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
    #      ExtraTreesClassifier(n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
    #      GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1),
    #      LogisticRegression(random_state=1)
    #      ],
    #     ######### Second level ########
    #     [RandomForestClassifier(n_estimators=200, criterion="entropy", max_depth=5, max_features=0.5, random_state=1)]
    # ]

    models = [
        ######## First level ########
        [
            opt('rfc'),
            opt('etc'),
            # opt('gbc'),
            # opt('lr'),
        ],
        ######## Second level ########
        [
            opt('rfc'),
        ],
    ]

    # use StackNet to stack the models
    StackNetmodel = StackNetClassifier(models,
                                       folds=4,
                                       # metric="auc",
                                       restacking=False,
                                       use_retraining=True,
                                       use_proba=True,
                                       random_state=12345,
                                       n_jobs=1,
                                       verbose=1)
    StackNetmodel.fit(X_train, y_train)
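    # Hedged evaluation sketch (not in the original): pystacknet's StackNetClassifier
    # follows the sklearn predict_proba convention, so the stacked model can be
    # scored on the held-out split by taking the arg-max class per row.
    stack_proba = StackNetmodel.predict_proba(X_test)
    stack_preds = stack_proba.argmax(axis=1)
    print("StackNet test accuracy:", (stack_preds == y_test).mean())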
from hyperopt import tpe

X = df[features].values
y = df["outcome"]
test_size = int(0.1 * len(y))
np.random.seed(10)
indices = np.random.permutation(len(X))
X_train = np.float64(X[indices[:-test_size]])
y_train = np.float64(y[indices[:-test_size]])
X_test = np.float64(X[indices[-test_size:]])
y_test = np.float64(y[indices[-test_size:]])

estim = HyperoptEstimator(algo=tpe.suggest,
                          trial_timeout=300,
                          classifier=random_forest('my_random_forest'))
estim.fit(X_train, y_train)
print(estim.score(X_test, y_test))  # <<show score here>>
print(estim.best_model())

# In[Another look at error: Out of bag error, plot written by Kian Ho]:
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
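# The imports above set up an out-of-bag (OOB) error plot in the spirit of the
# scikit-learn "OOB Errors for Random Forests" example credited to Kian Ho; the
# following is a hedged sketch of that idea (the synthetic data and parameter
# values are assumptions, not the original figure). With warm_start=True and
# oob_score=True, trees are added incrementally and the OOB error tracked as
# 1 - clf.oob_score_.
X_oob, y_oob = make_classification(n_samples=500, n_features=25,
                                   n_informative=15, random_state=123)
oob_curve = OrderedDict()
clf = RandomForestClassifier(warm_start=True, oob_score=True, random_state=123)
for n_estimators in range(15, 176, 20):
    clf.set_params(n_estimators=n_estimators)
    clf.fit(X_oob, y_oob)  # warm_start adds trees instead of refitting from scratch
    oob_curve[n_estimators] = 1 - clf.oob_score_  # OOB error rate
plt.plot(list(oob_curve.keys()), list(oob_curve.values()))
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.show()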
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np  # needed below for np.random.seed and np.load
from hpsklearn import HyperoptEstimator, svc, random_forest, knn
from hyperopt import tpe
from sklearn.metrics import f1_score


def scorer(yt, yp):
    # hyperopt minimises the loss, so return 1 - macro F1
    return 1 - f1_score(yt, yp, average='macro')


if __name__ == '__main__':
    np.random.seed(42)
    train_X = np.load('data/train_X.npy')
    test_X = np.load('data/test_X.npy')
    train_Y = np.load('data/train_Y.npy')
    test_Y = np.load('data/test_Y.npy')

    estim = HyperoptEstimator(classifier=random_forest('rf'),
                              algo=tpe.suggest,
                              loss_fn=scorer,
                              max_evals=200,
                              trial_timeout=1200)
    estim.fit(train_X, train_Y)
    yp = estim.predict(test_X)
    print(f1_score(test_Y, yp, average='macro'))
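    # PCA and TSNE are imported above but unused in this excerpt; a minimal,
    # assumed follow-up that projects the test set to 2-D with PCA and colours
    # the points by the predicted labels.
    embedding = PCA(n_components=2).fit_transform(test_X)
    plt.scatter(embedding[:, 0], embedding[:, 1], c=yp, s=5)
    plt.title("PCA projection of test_X coloured by predictions")
    plt.show()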