from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator

# NOTE: the data file must label its outcome column 'target'.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
(training_features, testing_features,
 training_target, testing_target) = train_test_split(
    features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.9353463587921848
# The pipeline is assembled from named stages so each one can be read on its own.
polynomial_expansion = PolynomialFeatures(
    degree=2, include_bias=False, interaction_only=False)
stacked_extra_trees = StackingEstimator(
    estimator=ExtraTreesClassifier(
        bootstrap=False,
        criterion="gini",
        max_features=0.55,
        min_samples_leaf=2,
        min_samples_split=5,
        n_estimators=100,
    )
)
stacked_naive_bayes = StackingEstimator(
    estimator=BernoulliNB(alpha=0.001, fit_prior=True))
final_tree = DecisionTreeClassifier(
    criterion="entropy", max_depth=5, min_samples_leaf=12, min_samples_split=9)

exported_pipeline = make_pipeline(
    polynomial_expansion,
    stacked_extra_trees,
    MinMaxScaler(),
    stacked_naive_bayes,
    final_tree,
)

# Fit on the training split, then predict the held-out split.
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# Two sub-grids: vary the depth at a fixed forest size, then vary the forest
# size at a fixed depth.
parameter_grid = [
    {'n_estimators': [100], 'max_depth': [2, 4, 7, 12, 16]},
    {'max_depth': [4], 'n_estimators': [25, 50, 100, 250]},
]
metrics = ['precision_weighted', 'recall_weighted']

# Run one 5-fold grid search per scoring metric and print every grid point.
for metric in metrics:
    print("\n##### Searching optimal parameters for", metric)
    classifier = GridSearchCV(
        ExtraTreesClassifier(random_state=0),
        parameter_grid,
        cv=5,
        scoring=metric,
    )
    classifier.fit(X_train, y_train)

    cv_results = classifier.cv_results_
    means = cv_results['mean_test_score']

    print("\nGrid scores for the parameter grid:")
    for results, mean in zip(cv_results['params'], means):
        print(results, " -> ", format(mean, ".3f"))

    print("\nBest parameters:", classifier.best_params_)

    # Predictions of the refitted best estimator on the held-out data.
    y_pred = classifier.predict(X_test)
    print("\nPerformance report:\n")
estimators = RF.estimators_ a = get_ensemble_score(estimators[:k], X_test, y_test) scores1.append(a) meta_scores.append(scores1) meta_scores = np.array(meta_scores) s = np.mean(meta_scores, axis=0).tolist() RF_cross_val_scores.append(s) pickle.dump(RF_cross_val_scores, open('bag_sabit2_cross_val_scores', 'wb')) RF_cross_val_scores = [] for i, data in enumerate(datasets[:]): X, y = arff_to_numpy('Datasets/' + str(data) + '.arff') meta_scores = [] kf = KFold(n_splits=3, shuffle=True) for train_index, test_index in kf.split(X): scores1 = [] X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None]: RF = ExtraTreesClassifier(max_depth=k, n_estimators=50, n_jobs=-1) RF.fit(X_train, y_train) estimators = RF.estimators_ a = get_ensemble_score(estimators[:k], X_test, y_test) scores1.append(a) meta_scores.append(scores1) meta_scores = np.array(meta_scores) s = np.mean(meta_scores, axis=0).tolist() RF_cross_val_scores.append(s) pickle.dump(RF_cross_val_scores, open('et_sabit2_cross_val_scores', 'wb'))
else: movie_reviews_data_folder = "/home/gregor/ipyServer/data/movie_review/train_sub" movie_reviews_test_data_folder = "/home/gregor/ipyServer/data/movie_review/test_sub" dataset = load_files(movie_reviews_data_folder, shuffle=False) test_data = load_files(movie_reviews_test_data_folder, shuffle=False) print(len(test_data)) print("n_samples: %d\n" % len(dataset.data)) # Build vectorizer vectorizer = TfidfVectorizer(sublinear_tf=False, max_df=0.1, ngram_range=(1, 2)) text_clf = ExtraTreesClassifier(max_depth=1024, min_samples_leaf=8, min_samples_split=16) reduceParams = 80 * 1000 metrics_out = '/home/gregor/ipyServer/movie_review/output/metrics_ExTrees.out' kaggle_test_out = '/home/gregor/ipyServer/movie_review/output/kaggle_test_04_ExTrees.csv' ################################################################################# outfile = open(metrics_out, 'a+') outfile.write(('*' * 70) + '\n') outfile.write(('*' * 5) + (' ' * 25) + 'Begin Run' + (' ' * 25) + ('*' * 5) + '\n') outfile.write(('*' * 70) + '\n\n')
from sklearn.datasets import load_iris
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Load the iris measurements and class labels as plain arrays.
X, Y = load_iris(return_X_y=True)
print(X.shape)

# Fit a 50-tree extra-trees ensemble; fit() returns the estimator itself.
model = ExtraTreesClassifier(n_estimators=50).fit(X, Y)
print(model.feature_importances_)

# Reuse the already-fitted model (prefit=True) to filter the feature columns.
sfModel = SelectFromModel(model, prefit=True)
X1 = sfModel.transform(X)
for shown in (X1.shape, X1):
    print(shown)
def get_feature_importance(X, y):
    """Fit a default ``ExtraTreesClassifier`` on ``(X, y)`` and report its importances.

    Args:
        X: Training samples, passed straight to ``fit``.
        y: Target values, passed straight to ``fit``.

    Returns:
        tuple: ``(X, importances)`` where ``X`` is the input handed back
        unchanged and ``importances`` is the fitted model's
        ``feature_importances_``.
    """
    # fit() returns the estimator, so the importances can be read off directly.
    fitted = ExtraTreesClassifier().fit(X, y)
    return X, fitted.feature_importances_
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
try:
    # Legacy location (scikit-learn < 0.20). It is tried first so that, where it
    # still exists, StratifiedKFold keeps the constructor this script was
    # written against.
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import StratifiedKFold
except ImportError:
    # `sklearn.cross_validation` was removed in scikit-learn 0.20.
    # NOTE: model_selection.StratifiedKFold takes `n_splits` and yields folds
    # through `.split(X, y)`; code written for the legacy constructor must be
    # adapted accordingly.
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
# `sklearn.datasets.samples_generator` was removed in scikit-learn 0.24;
# make_blobs is exported from `sklearn.datasets` itself.
from sklearn.datasets import make_blobs

# Create the training data set.
data, target = make_blobs(n_samples=50000, centers=2, random_state=0, cluster_std=0.60)

# The individual base models used in the blend.
clfs = [
    RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
    RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
    ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
    ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
    GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5),
]

# Hold out part of the data as the test set.
X, X_predict, y, y_predict = train_test_split(data, target, test_size=0.33, random_state=2017)

# One column per base model: these matrices receive each model's output and
# become the blended (second-level) feature sets.
dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)))

# 5-fold stacking
# # accuracy_score(label_test,result)
# # print (classification_report(label_test,result,digits=4))
# # scores = cross_val_score(clf, feature_matrix, labels)
# # scores.mean()
# # clf = ExtraTreesClassifier(n_estimators=150)
# # scores = cross_val_score(clf, feature_matrix, labels, cv=10)
# # scores.mean()
# # clf = clf.fit(feature_train,label_train)
# clf = svm.SVC(C=1.0,kernel='rbf',cache_size=1000,decision_function_shape='ovr',shrinking=True,probability=True)
# scores = cross_val_score(clf,feature_matrix,labels,cv=StratifiedKFold(n_splits=4,shuffle=True))
# print (scores, scores.mean())
# clf.fit(feature_train, label_train)

# Extra-Trees
clf = ExtraTreesClassifier(
    n_estimators=200,
    n_jobs=-1,
    max_features=30,
    criterion='gini',
)

# 4-fold shuffled, stratified cross-validation over the full feature matrix.
cv_splitter = StratifiedKFold(n_splits=4, shuffle=True)
scores = cross_val_score(clf, feature_matrix, labels, cv=cv_splitter)
print(scores, scores.mean())

# Fit on the training split (fit() returns the same estimator) and predict the test split.
clf.fit(feature_train, label_train)
result = clf.predict(feature_test)

# The values of the bare expressions below are computed and then discarded.
accuracy_score(label_test, result)
print(classification_report(label_test, result, digits=4))
print(clf.max_depth)
clf.get_params()
# print(classification_report_imbalanced(label_test, result))
clf.score(feature_test, label_test)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
#print ('hlo',clf.oob_score_)

# Confusion matrix of the test-split predictions, printed and drawn.
cm = sklearn.metrics.confusion_matrix(label_test, result)
print(cm)
pl.matshow(cm)
# Encode the species names as integer class labels.
le = LabelEncoder().fit(train.species)
labels = le.transform(train.species)
classes = list(le.classes_)
test_ids = test.id

# Drop the identifier (and, for the training frame, the label) columns.
train = train.drop(['id', 'species'], axis=1)
test = test.drop(['id'], axis=1)

sss = StratifiedShuffleSplit(labels, 10, test_size=0.2, random_state=23)

# Every iteration overwrites the previous assignment, so the arrays used
# below come from the last split produced by `sss`.
for train_index, test_index in sss:
    X_train, X_test = train.values[train_index], train.values[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

# min_samples_split must be an int >= 2 in current scikit-learn (1 raises);
# a single-sample node can never be split, so 2 is the smallest meaningful value.
trees = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_split=2)
trees.fit(X_train, y_train)

# Evaluate on the held-out part of the split.
train_predictions = trees.predict(X_test)
accuracy = accuracy_score(y_test, train_predictions)
# print() with a single pre-formatted argument runs on Python 2 and Python 3.
print("Accuracy: {:.2%}".format(accuracy))

train_prob = trees.predict_proba(X_test)
loss = log_loss(y_test, train_prob)
print("Log loss: {:10.4f}".format(loss))

# Class probabilities for the test frame, one column per species, id first.
trees_predict = trees.predict_proba(test)
submission = pd.DataFrame(trees_predict, columns=classes)
submission.insert(0, 'id', test_ids)
#Random Forest clf_rf = RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_split=10) clf_rf = clf_rf.fit(X, y) score_rf = cross_val_score(clf_rf, X, y, cv=5).mean() print(score_rf) # In[ ]: #Extremely Randomised Trees clt_ext = ExtraTreesClassifier(max_features='auto', bootstrap=True, oob_score=True, n_estimators=1000, max_depth=None, min_samples_split=10) clt_ext.fit(X, y) score_ext = cross_val_score(clt_ext, X, y, cv=5).mean() print(score_ext) # In[ ]: #Gradient Boost import warnings warnings.filterwarnings clf_gb = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1, max_depth=3,
X = ft.values X = np.random.permutation(X) X = np.random.permutation(X.T).T y0 = ft.index.map(lb.structure_name).values #%% for label, y in (('All', y0), ): #*((key, np.where(y0==key, key, 'Other')) for key in np.unique(y0))): #%% transcripts = [] scores = [] trials = 1000 for seed in tqdm(range(trials)): clf = ExtraTreesClassifier(n_estimators=50, random_state=seed, max_depth=5, criterion='entropy', min_impurity_decrease=0.05) clf.fit(X, y) dimred = SelectFromModel(clf, prefit=True, max_features=50) transcripts.extend(ft.T.index[dimred.get_support()]) scores.extend(clf.feature_importances_[dimred.get_support()]) df0 = pd.DataFrame() df0['transcripts'] = transcripts df0['scores'] = scores #%% def f(gp): return pd.DataFrame([[len(gp), max(gp['scores'])]],
for i in label_l: X[:, i] = labelencoder_X.fit_transform(X[:, i]) onehotencoder = OneHotEncoder(categorical_features=label_l) X = onehotencoder.fit_transform(X).toarray() # Encoding the Dependent Variable labelencoder_y = LabelEncoder() y = labelencoder_y.fit_transform(y) # Feature Scaling from sklearn.preprocessing import StandardScaler sc = StandardScaler() X = sc.fit_transform(X) # Build a forest and compute the feature importances forest = ExtraTreesClassifier(n_estimators=250, random_state=0) forest.fit(X, y) importances = forest.feature_importances_ std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0) indices = np.argsort(importances)[::-1] print("Feature ranking:") for f in range(X.shape[1]): print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) print("Features sorted by their score:") print( sorted(zip(map(lambda x: round(x, 4), forest.feature_importances_),
def main():
    """Fit every configured model, pickle it, log a report and write a submission.

    The first 50000 rows of the stored matrix are the submission set; the
    remaining rows are split 70/30 into train/temp, and temp is split again
    70/30 into test/validation. For each entry of ``model_dic`` the function:

    1. fits the estimator on the training part,
    2. pickles it to ``cache/<model_name>.pkl``,
    3. scores it on the test part and appends a report entry to
       ``report/report_ensemble.json``,
    4. writes predictions for the submission rows to
       ``submission/submission_file_<model_name>.csv``.

    The ``report``, ``cache`` and ``submission`` directories are not created
    here, so they must already exist.
    """
    # load data and split into submission and training data
    # The archive is opened once and closed deterministically.
    # allow_pickle=True: X is later converted with `.toarray()`, so it is
    # presumably a pickled sparse matrix stored as a 0-d object array, which
    # NumPy >= 1.16.3 refuses to load by default (TODO confirm against the
    # code that writes this file).
    # SECURITY: unpickling executes arbitrary code; only load trusted files.
    with np.load("data/bagOfWods_3000.npz", allow_pickle=True) as archive:
        X = archive['X'][()]
        Y = archive['y'][()]

    test_X = X[0:50000, :]
    # test_Y = Y[0:50000]
    train_X = X[50000:, :]
    train_Y = Y[50000:]

    # split into train/test
    X_train, X_temp, y_train, y_temp = train_test_split(train_X, train_Y, test_size=0.3, random_state=0)
    # X_valid / y_valid are produced by the split but not used in this function.
    X_test, X_valid, y_test, y_valid = train_test_split(X_temp, y_temp, test_size=0.3, random_state=1)
    print("running model....")

    # Start every run with an empty report file and an empty in-memory list.
    name_report = "report/report_%s.json" % "ensemble"
    models = []
    with open(name_report, mode='w') as f:
        json.dump(models, f)

    # The timer starts once, so each report's time_delay is cumulative.
    start = time.time()

    # dictionary of different models with their parameters
    model_dic = {
        "randomForest": RandomForestClassifier(n_jobs=-1, n_estimators=3000, max_depth=10),
        "logistic": LogisticRegression(n_jobs=-1),
        "svmrdf": SVC(probability=True),
        "linearSVM": LinearSVC(),
        "extra": ExtraTreesClassifier(n_estimators=3000, max_depth=10, n_jobs=-1),
    }

    # parameter grid
    PARAM_GRID = {
        "randomForest": {'n_estimators': [2000], 'max_depth': [8, 11]},
        "logistic": {},
        "svmrdf": {'C': [0.1, 1]},
        "linearSVM": {},
        "extra": {'n_estimators': [2000], 'max_depth': [8, 11]},
    }

    # loop through dictionary of models and fit the model on data
    for model_name, model in model_dic.items():
        print("now %s is running" % model_name)
        print(PARAM_GRID[model_name])
        # Grid search is currently disabled, so PARAM_GRID is only printed and
        # the estimator is fitted with its constructor parameters:
        # grid = GridSearchCV(model, PARAM_GRID[model_name], scoring='accuracy', n_jobs=-1, cv=7)
        model.fit(X_train, y_train)

        # output model
        model_file_name = "cache/%s.pkl" % (model_name)
        with open(model_file_name, 'wb') as output:
            pickle.dump(model, output)

        # These model names are given a dense copy of the data for prediction.
        needs_dense = model_name in ('gbm', 'svmrdf')

        result = model.predict(X_test.toarray() if needs_dense else X_test)
        accuracy = accuracy_score(y_test, result)

        end = time.time()
        time_delay = end - start

        submission_name = "submission/submission_file_%s.csv" % (model_name)
        report = {
            "model_name": model_name,
            "accuracy": accuracy,
            "time_delay": time_delay,
            "submission_name": submission_name,
        }
        # Entries are stored as str(dict), so the file is a JSON list of strings.
        models.append(str(report))
        with open(name_report, mode='w') as modeljson:
            json.dump(models, modeljson)

        test_result = model.predict(test_X.toarray() if needs_dense else test_X)

        # make a submission file.
        # Ids are 1-based and sized from the predictions, not hard-coded.
        submission = pd.DataFrame({
            'id': np.arange(1, len(test_result) + 1),
            'y': test_result,
        })
        submission.to_csv(submission_name, index=False)
#from sklearn.neural_network import MLPClassifier # #sizes = (200,100,100) #clfNN = MLPClassifier(solver='lbfgs', alpha=.015, # hidden_layer_sizes=sizes, random_state=15) #clfOne = OneVsRestClassifier(MLPClassifier(solver='lbfgs', alpha=.015, # hidden_layer_sizes=sizes, random_state=15), n_jobs = -1) # #clfNN.fit(X_train,y_train) #clfOne.fit(X_train,y_train) # #predicted_NN = clfNN.predict(X_test) #predicted_One = clfOne.predict(X_test) #%% Use TPOT to find best parameters/models clfExtra = make_pipeline( ExtraTreesClassifier(criterion="gini", max_features=0.53, n_estimators=500)) clfExtra.fit(X_train, y_train) predicted = clfExtra.predict(X_test) #%% from sklearn.metrics import confusion_matrix from classification_utilities import display_cm, display_adj_cm conf = confusion_matrix(y_test, predicted) display_cm(conf, facies_labels, hide_zeros=True) def accuracy(conf): total_correct = 0. nb_classes = conf.shape[0]
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: the data file must label its class column 'target'.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
(training_features, testing_features,
 training_target, testing_target) = train_test_split(
    features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was:0.8385981283133181
# Union of two pass-through copies of the input, i.e. every column appears twice.
duplicated_inputs = make_union(FunctionTransformer(copy), FunctionTransformer(copy))
extra_trees = ExtraTreesClassifier(
    bootstrap=False,
    criterion="entropy",
    max_features=0.6500000000000001,
    min_samples_leaf=2,
    min_samples_split=11,
    n_estimators=800,
)
exported_pipeline = make_pipeline(duplicated_inputs, extra_trees)

# Fit on the training split, then predict the held-out split.
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from flask import Flask, render_template
import flask
#from flask.ext.sqlalchemy import SQLAlchemy
import numpy as np
import pandas as pd
import os
from sklearn.ensemble import ExtraTreesClassifier
import modify_data

#Model for Forest Identifying#
# The classifier is trained once, at import time, from the CSV next to the app.
train = pd.read_csv('full_cols.csv')
train['Cover_Type'] = train['Cover_Type'].apply(str)
# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally.
X_train = train.drop(['Cover_Type', 'Unnamed: 0'], axis=1)
y_train = train['Cover_Type']
FOREST = ExtraTreesClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
# End of Forest Model

app = Flask(__name__)


@app.route('/')
def viz_page():
    """
    Visualization page for the app.

    Returns the contents of ``visualization.html``, read on every request.
    """
    with open('visualization.html', 'r') as viz_file:
        return viz_file.read()


@app.route('/test.html')
def test_page():
    """Return the contents of ``test.html``, read on every request."""
    with open('test.html', 'r') as test_file:
        return test_file.read()
def train_classifiers(X_data, y):
    """
    Trains several classifiers and reports model quality.

    The data is split 70/30 (``random_state=42``) and every classifier family
    is passed, together with its parameter grid and the four split parts, to
    ``train_single_classifier_type``.

    :param X_data: feature matrix, forwarded to ``train_test_split``
    :param y: target values, forwarded to ``train_test_split``
    :return: a flat 14-tuple of ``(model, grid)`` pairs, in this order:
        SVM, XGBoost, Random Forest, Extra Trees, LGBM, RGF, FastRGF.
        The KNN model is trained as well, but it is NOT part of the
        returned tuple.
    """
    # Split the dataset into Train and Test
    seed = 42
    test_size = 0.3
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y, test_size=test_size, random_state=seed)

    svm = SVC()
    svm_params = {
        'C': [1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.001, 0.0001],
        'kernel': ['linear', 'rbf'],
    }
    svm_model, svm_grid = train_single_classifier_type(
        svm, "SVM", svm_params, X_train, X_test, y_train, y_test)

    knn = KNeighborsClassifier()
    knn_params = {
        'n_neighbors': [5, 6, 7, 8, 9, 10],
        'leaf_size': [1, 2, 3, 5],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'n_jobs': [-1],
    }
    # Trained for its side effects in train_single_classifier_type only; the
    # return tuple's shape is part of the interface, so KNN is not added to it.
    knn_model, knn_grid = train_single_classifier_type(
        knn, "KNN", knn_params, X_train, X_test, y_train, y_test)

    # Train the XGboost Model for Classification
    xgb_model = xgb.XGBClassifier()

    # brute force scan for all parameters, here are the tricks
    # usually max_depth is 6,7,8
    # learning rate is around 0.05, but small changes may make big diff
    # tuning min_child_weight subsample colsample_bytree can have
    # much fun of fighting against overfit
    # n_estimators is how many round of boosting
    # finally, ensemble xgboost with multiple seeds may reduce variance
    xgb_parameters = {
        'nthread': [4],  # when use hyperthread, xgboost may become slower
        'objective': ['binary:logistic'],
        'learning_rate': [0.05, 0.1],  # so called `eta` value
        'max_depth': [6, 7, 8],
        'min_child_weight': [1, 11],
        'silent': [1],
        'subsample': [0.8],
        'colsample_bytree': [0.7, 0.8],
        'n_estimators': [5, 100, 1000],  # number of trees, change it to 1000 for better results
        'missing': [-999],
        'seed': [1337],
    }
    xgb_trained, xgb_grid = train_single_classifier_type(
        xgb_model, "XGBoost", xgb_parameters, X_train, X_test, y_train, y_test)

    rfc = RandomForestClassifier()
    rfc_parameters = {
        'max_depth': [4, 5, 6],
        'n_estimators': [100, 200],
        'criterion': ['gini', 'entropy'],
        # 'auto' was dropped from the grid: for classifiers it was an alias of
        # 'sqrt' (so it only duplicated work) and scikit-learn >= 1.3 rejects it.
        'max_features': ['sqrt', 'log2'],
        'min_samples_leaf': [2, 4],
        'min_samples_split': [2, 5, 10],
    }
    rfc_model, rfc_grid = train_single_classifier_type(
        rfc, "Random Forest", rfc_parameters, X_train, X_test, y_train, y_test)

    ext = ExtraTreesClassifier()
    ext_parameters = {
        'n_estimators': [50, 100],
        'max_features': [5, 10, 25],
        'min_samples_leaf': [2, 5, 10],
        'min_samples_split': [2, 5, 10],
    }
    ext_model, ext_grid = train_single_classifier_type(
        ext, "Extra Trees", ext_parameters, X_train, X_test, y_train, y_test)

    lgbm = LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        n_jobs=-1,  # Updated from 'nthread'
        silent=True)
    # Create parameters to search
    lgbm_parameters = {
        'max_depth': [5, 6, 7, 8, 9, 10, 15, 20],
        'learning_rate': [0.005],
        'n_estimators': [100, 150, 500],
        'num_leaves': [6, 8, 12, 16],
        'boosting_type': ['gbdt'],
        'objective': ['binary'],
        'random_state': [501],  # Updated from 'seed'
        'colsample_bytree': [0.65],
        'subsample': [0.7],
        'reg_alpha': [1, 10],
        'reg_lambda': [10, 100],
    }
    lgbm_model, lgbm_grid = train_single_classifier_type(
        lgbm, "LGBM", lgbm_parameters, X_train, X_test, y_train, y_test)

    rgf = RGFClassifier()
    rgf_parameters = {
        'max_leaf': [900],
        'l2': [0.1, 0.05, 1.0],
        'min_samples_leaf': [5, 4, 3],
        'algorithm': ["RGF", "RGF_Opt", "RGF_Sib"],
        'loss': ["Log"],
    }
    rgf_model, rgf_grid = train_single_classifier_type(
        rgf, "RGF", rgf_parameters, X_train, X_test, y_train, y_test)

    frgf = FastRGFClassifier()
    frgf_parameters = {
        'max_leaf': [100, 200, 900],
        'n_estimators': [100, 1000],
        'max_bin': [10, 100],
        'l2': [0.1, 100, 1000],
        'min_samples_leaf': [5, 6],
        'opt_algorithm': ['rgf'],
        'loss': ["LS"],
    }
    frgf_model, frgf_grid = train_single_classifier_type(
        frgf, "FRGF", frgf_parameters, X_train, X_test, y_train, y_test)

    return svm_model, svm_grid, \
        xgb_trained, xgb_grid, \
        rfc_model, rfc_grid, \
        ext_model, ext_grid, \
        lgbm_model, lgbm_grid, \
        rgf_model, rgf_grid, \
        frgf_model, frgf_grid
def eval_trees_model(df):
    """Cross-validate logistic regression, random forest, extra trees and an RF+ET blend.

    For each of 10 shuffled folds the tweet text is TF-IDF vectorised, a
    logistic regression is fitted on all terms, and the 200 terms with the
    largest mean absolute coefficient are used (as a dense matrix) to fit a
    random forest and an extra-trees classifier. The blend averages the two
    tree ensembles' class probabilities. Mean and standard deviation of the
    per-fold accuracies are printed; nothing is returned.

    Args:
        df: frame with a ``tweets_text`` column (documents) and a
            ``tweet_group`` column (labels). The fold indices are used as
            labels into these columns, so this presumably expects a default
            0..n-1 index — TODO confirm against the caller.
    """
    n_folds = 10

    # perform k-fold validation
    kf = KFold(n=df.shape[0], n_folds=n_folds, random_state=SEED, shuffle=True)
    acc_scores_log = np.zeros(n_folds)
    acc_scores_rf = np.zeros(n_folds)
    acc_scores_et = np.zeros(n_folds)
    acc_scores_comb = np.zeros(n_folds)

    # logistic regression model with defaults
    log_cl = LogisticRegression()
    # rf model
    rf_cl = RandomForestClassifier(n_estimators=200, min_samples_split=16, random_state=SEED)
    # extra trees model
    et_cl = ExtraTreesClassifier(n_estimators=200, min_samples_split=16, random_state=SEED)

    for fold_n, (train_indices, fold_eval_indices) in enumerate(kf):
        print("Evaluating fold {} of {}".format(fold_n+1, n_folds))

        # take a tfidf vectorisation of the text
        # The three flags are real booleans: scikit-learn's parameter
        # validation rejects the integer 1 for them.
        tfv = TfidfVectorizer(min_df=3, max_features=None,
                              strip_accents='unicode', analyzer='word',
                              token_pattern=r'\w{1,}', decode_error='ignore',
                              ngram_range=(1, 1), use_idf=True,
                              smooth_idf=True, sublinear_tf=True)
        X_train = tfv.fit_transform(df['tweets_text'][train_indices])
        X_eval = tfv.transform(df['tweets_text'][fold_eval_indices])
        y_train = np.array(list(df['tweet_group'][train_indices]))
        y_eval = np.array(list(df['tweet_group'][fold_eval_indices]))

        log_cl.fit(X_train, y_train)
        log_preds = log_cl.predict(X_eval)
        acc_scores_log[fold_n] = accuracy_score(y_eval, log_preds)

        # use the most important words to train the tree ensembles:
        # rank words by the mean absolute coefficient over the rows of coef_
        coef = np.abs(log_cl.coef_).mean(0)
        important_words_ind = np.argsort(coef)[-200:]
        # toarray() gives a plain ndarray; todense() would give np.matrix,
        # which current scikit-learn estimators refuse as input.
        X_train_dense = X_train[:, important_words_ind].toarray()
        X_eval_dense = X_eval[:, important_words_ind].toarray()

        rf_cl.fit(X_train_dense, y_train)
        rf_preds = rf_cl.predict(X_eval_dense)
        rf_proba = rf_cl.predict_proba(X_eval_dense)
        acc_scores_rf[fold_n] = accuracy_score(y_eval, rf_preds)

        et_cl.fit(X_train_dense, y_train)
        et_preds = et_cl.predict(X_eval_dense)
        et_proba = et_cl.predict_proba(X_eval_dense)
        acc_scores_et[fold_n] = accuracy_score(y_eval, et_preds)

        # combine predictions by averaging the class probabilities of the two
        # tree ensembles; the columns only line up if both saw the same classes
        if not np.array_equal(rf_cl.classes_, et_cl.classes_):
            print("Error: different classes for classifiers. Combined predictions incorrect")
        comb_proba = 0.5*rf_proba + 0.5*et_proba
        # map column indices back through the classes of a blended classifier
        comb_preds = [rf_cl.classes_[i] for i in comb_proba.argmax(1)]
        acc_scores_comb[fold_n] = accuracy_score(y_eval, comb_preds)

    print("Mean Log Accuracy:{}, Std:{}".format(np.mean(acc_scores_log), np.std(acc_scores_log)))
    print("Mean RF Accuracy:{}, Std:{}".format(np.mean(acc_scores_rf), np.std(acc_scores_rf)))
    print("Mean Extra Trees Accuracy:{}, Std:{}".format(np.mean(acc_scores_et), np.std(acc_scores_et)))
    print("Mean Combined Accuracy:{}, Std:{}".format(np.mean(acc_scores_comb), np.std(acc_scores_comb)))
# Read the feature table and the label table for the selected `cas9` variant.
features_path = '../Experiment Data/deephf_x.h5'
labels_path = '../Experiment Data/deephf_y_' + cas9 + '.h5'
features = pd.DataFrame(pd.read_hdf(features_path, key='deephf'))
labels = pd.DataFrame(pd.read_hdf(labels_path, key='deephf'))

# Place features and labels side by side (columns renumbered), then drop
# every row containing a missing value and renumber the rows.
data = (
    pd.concat([features, labels], axis=1, ignore_index=True)
    .dropna()
    .reset_index(drop=True)
)

# 85/15 split, stratified on the last column.
train_data, test_data = train_test_split(
    data,
    test_size=0.15,
    random_state=1,
    stratify=data.iloc[:, -1],
)

# Pipeline steps: model-based feature selection, scaling, then the SVM.
extraTree = ExtraTreesClassifier(n_estimators=500, n_jobs=-1, random_state=1, verbose=2)
svm_classifier = SVC(
    C=10,
    gamma=0.001,
    kernel='rbf',
    cache_size=20000,
    verbose=True,
    max_mem_size=6000,
    probability=True,
)
steps = [
    ('SFM', SelectFromModel(estimator=extraTree)),
    ('scaler', StandardScaler()),
    ('SVM', svm_classifier),
]

# Every column but the last is an input; the last column is the target.
train_x = train_data.iloc[:, :-1]
train_y = train_data.iloc[:, -1]
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier as kn

# Input columns shared by the training and the test file; kept in one place
# so the two selections cannot drift apart.
FEATURE_COLUMNS = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol'
]

data = pd.read_csv('red.csv')
x = data[FEATURE_COLUMNS]
y = data['quality']

clf1 = GaussianNB()
# min_samples_split must be an int >= 2 in current scikit-learn (1 raises);
# a single-sample node can never be split, so 2 is the smallest meaningful value.
clf2 = ExtraTreesClassifier(n_estimators=82, max_depth=None, min_samples_split=2, random_state=0)
clf3 = RandomForestClassifier(random_state=0, n_estimators=250, min_samples_split=2)
clf4 = kn(n_neighbors=13)

# Soft voting: the members' class probabilities are combined with the given weights.
clf = VotingClassifier(
    estimators=[('gnb', clf1), ('et', clf2), ('rf', clf3), ('kn', clf4)],
    voting='soft',
    weights=[1, 8, 2, 1],
).fit(x, y)

# From here on `x` holds the test-file features, no longer the training ones.
test = pd.read_csv('red_test.csv')
x = test[FEATURE_COLUMNS]
# Stratified 80/20 split into training and test sets.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels)

# One row per experiment: the banner printed before the run, the classifier,
# and the two strings forwarded unchanged to run_classifier.
experiments = (
    (
        # Support Vector Machine
        'Support Vector Machine starting ...',
        LinearSVC(),
        "CNN-SVM Accuracy: {0:0.1f}%",
        "SVM Confusion matrix",
    ),
    (
        # Extra Trees
        'Extra Trees Classifier starting ...',
        ExtraTreesClassifier(n_jobs=1, n_estimators=10, criterion='gini',
                             min_samples_split=2, max_features=50,
                             max_depth=None, min_samples_leaf=1),
        "CNN-ET Accuracy: {0:0.1f}%",
        "Extra Trees Confusion matrix",
    ),
    (
        # Random Forest
        'Random Forest Classifier starting ...',
        RandomForestClassifier(n_jobs=1, criterion='entropy', n_estimators=10,
                               min_samples_split=2),
        "CNN-RF Accuracy: {0:0.1f}%",
        "Random Forest Confusion matrix",
    ),
)

# `cl` is rebound on every pass and ends up as the random forest.
for banner, cl, accuracy_format, matrix_title in experiments:
    print(banner)
    run_classifier(cl, X_train, y_train, X_test, y_test, accuracy_format, matrix_title)
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import f_classif

# Generate data
X, y = samples_generator.make_classification(n_samples=150,
                                             n_features=25,
                                             n_classes=3,
                                             n_informative=6,
                                             n_redundant=0,
                                             random_state=7)

# Select top K features
# The target is a 3-class label, so features are scored with the ANOVA F-test
# for classification (f_classif). f_regression would treat the class ids as
# a numeric quantity.
k_best_selector = SelectKBest(f_classif, k=9)

# Initialize Extremely Random Forests classifier
classifier = ExtraTreesClassifier(n_estimators=60, max_depth=4)

# Construct the pipeline
processor_pipeline = Pipeline([('selector', k_best_selector), ('erf', classifier)])

# Set the parameters
# These override the constructor values above: k becomes 7, n_estimators 30.
processor_pipeline.set_params(selector__k=7, erf__n_estimators=30)

# Training the pipeline
processor_pipeline.fit(X, y)

# Predict outputs for the input data
output = processor_pipeline.predict(X)
print("\nPredicted output:\n", output)
# * loss for the 2nd, 3rd, 4th, 5th best move, etc (perfect move is
#   less likely if there are several very close alternatives)

modelnum = 0
# One binary model per (elo group, error threshold) pair: the target is
# whether the clipped mover gain reaches the threshold `cb`.
for elo_name, elo_df in train_df.groupby(train_df['elo_groups']):
    subset_df = elo_df
    for cb in chunk_bounds:
        msg('working on elo group %s, of size %i. fitting model for error >= %f' % (elo_name, subset_df.shape[0], cb))
        X = subset_df[features]
        y = (subset_df['clipped_movergain'] >= cb)

        # Hard-wired model choice: extra trees; the other branches are kept
        # as alternatives.
        rfc = True
        if rfc:
            extra = True
            if extra:
                clf = ExtraTreesClassifier(min_samples_split=200, min_samples_leaf=50, n_jobs=-1, n_estimators=NUM_ESTIMATORS, verbose=1)
            else:
                clf = RandomForestClassifier(min_samples_split=200, min_samples_leaf=50, n_jobs=-1, n_estimators=NUM_ESTIMATORS, verbose=1, oob_score=True)
        else:
            clf = GradientBoostingClassifier(min_samples_split=500, min_samples_leaf=300, n_estimators=NUM_ESTIMATORS, verbose=1, subsample=0.5, learning_rate=0.2)

        msg('CROSS VALIDATING')
        skf = StratifiedKFold(y, n_folds=2, shuffle=True)
        ins = []
        outs = []
        for train_index, test_index in skf:
            clf.fit(X.iloc[train_index], y.iloc[train_index])
            # average_precision_score takes (y_true, y_score) and is not
            # symmetric: the true labels go first, the model output second.
            # NOTE(review): the scores passed here are hard class predictions.
            ins.append(average_precision_score(y.iloc[train_index], clf.predict(X.iloc[train_index])))
            outs.append(average_precision_score(y.iloc[test_index], clf.predict(X.iloc[test_index])))
        msg("insample average precision score: %s = %f" % (ins, np.mean(ins)))
        msg("outsample average precision score: %s = %f" % (outs, np.mean(outs)))
# Import statements required for Plotly import plotly.offline as py import plotly.graph_objs as go from plotly import tools # Loading some example data data = pd.read_csv('2clstrain1200.csv', header=None) names = ["Decision Tree", "Random Forest", "ExtraTrees"] # Creating a Python List with our three Tree classifiers treeclassifiers = [ DecisionTreeClassifier(max_depth=5), RandomForestClassifier(max_depth=5, n_estimators=20, max_features=1), ExtraTreesClassifier() ] # X = data.iloc[:, 0:1] # y = data.iloc[:, 1] X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1) datasets = [ make_moons(noise=0.3, random_state=0), make_circles(noise=0.2, factor=0.5, random_state=1), make_blobs()
'features': ['numeric', 'categorical_encoded'], 'model': Sklearn(GradientBoostingRegressor(loss='lad', n_estimators=300, max_depth=7, max_features=0.2)), 'param_grid': {'n_estimators': (200, 400), 'max_depth': (6, 8), 'max_features': (0.1, 0.4)}, }, 'ab-ce': { 'features': ['numeric', 'categorical_encoded'], 'y_transform': y_log_ofs(200), 'model': Sklearn(AdaBoostRegressor(loss='linear', n_estimators=300)), 'param_grid': {'n_estimators': (50, 400), 'learning_rate': (0.1, 1.0)}, }, 'et-tst': { 'features': ['numeric'], # 'y_transform': y_log, 'model': Sklearn(ExtraTreesClassifier(2, max_features=0.2, n_jobs=-1)), }, 'et-ce': { 'features': ['numeric', 'categorical_encoded'], 'y_transform': y_log, 'model': Sklearn(ExtraTreesClassifier(200, max_features=0.2, n_jobs=-1)), }, 'et-ce-2': { 'features': ['numeric', 'categorical_encoded'], 'y_transform': y_log_ofs(200), 'model': Sklearn(ExtraTreesClassifier(200, max_features=0.2, n_jobs=-1)), },
params['max_depth'] = 3 print(cross_val_score(ExtraTreesClassifier(**params), X, Y, scoring=scoring)) params['max_depth'] = 4 print(cross_val_score(ExtraTreesClassifier(**params), X, Y, scoring=scoring)) params['max_depth'] = 5 print(cross_val_score(ExtraTreesClassifier(**params), X, Y, scoring=scoring)) exit(0)''' params = dict(n_estimators=1000, max_depth=3, class_weight='balanced', max_features=1, n_jobs=8) m = ExtraTreesClassifier(**params) m.fit(X, Y) # find threshold from sklearn.metrics import precision_recall_curve best_t = None precision, recall, thresholds = precision_recall_curve( Y, m.predict_proba(X)[:, 1]) for p, r, t in zip(precision, recall, thresholds): print(p, r, t) if r < 0.8: best_t = t break print('best threshold:', best_t) print('final', accuracy_score(Y, m.predict_proba(X)[:, 1] > best_t)) joblib.dump(m, 'regressor.joblib')
def _main():
    """Run the "Approach 1" pipeline and write the submission file.

    Steps, in order:

    1. Read ``data/cs-training.csv`` and ``data/cs-test.csv``, preprocess
       them and add the engineered features from ``feats``.
    2. Normalize every training column except ``excluded_cols`` and keep the
       last of three stratified shuffle splits (30% validation).
    3. Train each base estimator with ``utils.train_estimator``, score it on
       the validation part with ROC AUC and keep the best one.
    4. Optionally (``refit``) retrain the best estimator on the union of the
       per-estimator top-15 features.
    5. Predict class-1 probabilities for the test set and pass them, with the
       test index, to ``_prepare_submission_file``.

    Returns:
        None.
    """
    np.random.seed(rs)
    logger.info("Running script for Approach 1")
    tr_df = pd.read_csv(os.path.join("data", "cs-training.csv"), index_col=0)
    te_df = pd.read_csv(os.path.join("data", "cs-test.csv"), index_col=0)
    tr_df, te_df = _preprocess_data(tr_df, te_df)

    # Add features
    tr_df, te_df = feats.add_features_based_on_NumOCLL(tr_df, te_df)
    tr_df, te_df = feats.add_features_based_on_NumRELL(tr_df, te_df)
    tr_df, te_df = feats.add_features_based_on_RUoUL(tr_df, te_df)

    # Preparing dataset for training
    excluded_cols = [
        "age",
        "MonthlyIncome",
        "MonthlyIncome_Imputed",
        "SeriousDlqin2yrs",
    ]
    train_df = tr_df[tr_df.columns.difference(excluded_cols)]
    cols = train_df.columns.values.tolist()
    X, _ = utils.normalize_df(train_df)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and is what this function already uses for `y`.
    X = X.values
    y = tr_df["SeriousDlqin2yrs"].values

    # Split
    # Every iteration overwrites the previous one, so only the last of the
    # three splits is actually used below.
    sss = StratifiedShuffleSplit(n_splits=3, random_state=rs, test_size=0.3)
    for train_index, test_index in sss.split(X, y):
        X_train, X_valid = X[train_index], X[test_index]
        y_train, y_valid = y[train_index], y[test_index]
    logger.info("X {}, train {}, valid {}".format(
        X.shape, X_train.shape, X_valid.shape))

    # Train
    logger.info("Features used for training : {}".format(cols))
    base_estimators = [
        ExtraTreesClassifier(n_estimators=400, n_jobs=-1, random_state=rs),
        LogisticRegressionCV(random_state=rs),
        RandomForestClassifier(bootstrap=True, criterion="gini",
                               max_depth=None, max_features=5,
                               n_estimators=150, n_jobs=-1, random_state=rs),
        # SVC(C=0.01, gamma=0.01, kernel="rbf", probability=True,
        #     random_state=rs)
    ]

    # Each classifier is trained on 5 stratified splits
    # and the one (amongst the 5) with best AUC score is selected
    best_auc = 0.0
    common_top_n_features = []
    for est in base_estimators:
        fitted_est = utils.train_estimator(est, X_train, y_train, 5)

        # NOTE(review): importances are read from `est`, not `fitted_est`;
        # this presumably relies on train_estimator fitting `est` in place.
        top_n_features = []
        top_n_features_df = utils.log_important_features(est, cols)
        if top_n_features_df.shape[0] > 0:
            top_n_features = top_n_features_df.head(15).feature.values.tolist()
        # Accumulate the de-duplicated union of the top features seen so far.
        common_top_n_features.extend(top_n_features)
        common_top_n_features = list(set(common_top_n_features))
        logger.info("{} common_top_n_features : {}".format(
            len(common_top_n_features), common_top_n_features))

        # NOTE(review): the AUC is computed from hard class labels
        # (`predict`), not from probabilities -- confirm this is intended.
        preds = fitted_est.predict(X_valid)
        score = roc_auc_score(y_valid, preds)
        logger.info("AUC : {:.5f}".format(score))
        if score > best_auc:
            best_auc = score
            best_est = fitted_est
    logger.info("Best estimator : {}".format(best_est))

    # Re-fitting the best estimator using the common top N features
    refit = False  # TODO read from config
    if refit:
        logger.info("Re-fitting best estimator {} using top N features ..."
                    .format(best_est.__class__.__name__))
        X, _ = utils.normalize_df(train_df[common_top_n_features])
        X = X.values
        y = tr_df["SeriousDlqin2yrs"].values
        sss = StratifiedShuffleSplit(n_splits=3, random_state=rs,
                                     test_size=0.3)
        for train_index, test_index in sss.split(X, y):
            X_train, X_valid = X[train_index], X[test_index]
            y_train, y_valid = y[train_index], y[test_index]
        fitted_best_est = utils.train_estimator(best_est, X_train, y_train, 5)
        preds = fitted_best_est.predict(X_valid)
        score = roc_auc_score(y_valid, preds)
        logger.info("AUC : {:.5f}".format(score))
        if score > best_auc:
            best_auc = score
            # Keep the estimator that was just refit on the top-N columns.
            # The previous code stored `fitted_est`, i.e. the last base
            # estimator from the loop above, trained on all columns.
            best_est = fitted_best_est
        # NOTE(review): when the refit score does not beat `best_auc`,
        # `best_est` is left as it was, yet predictions below still use only
        # the top-N columns -- verify against utils.train_estimator.

    # Getting the predictions
    logger.info("Get the predictions using {} ...".format(best_est))
    te_df_, _ = utils.normalize_df(te_df[cols])
    identifiers = te_df_.index.tolist()
    test_features = te_df_[common_top_n_features] if refit else te_df_
    # Second column of predict_proba = probability of the positive class.
    p = [row[1] for row in best_est.predict_proba(test_features)]
    _prepare_submission_file(identifiers, p)
# 4-fold cross_validation for j in xrange(fold_ids.shape[1]): fold = j + 1 val_ids = fold_ids.ix[:, j].dropna() idx = train["ID"].isin(list(val_ids)) trainingSet = train[~idx] validationSet = train[idx] et = ExtraTreesClassifier(n_estimators=2000, criterion="entropy", max_depth=50, max_features=0.9, min_samples_split=3, min_samples_leaf=5, bootstrap=False, oob_score=False, random_state=112, verbose=0, n_jobs=-1) et.fit(trainingSet[feature_names], np.array(trainingSet["target"])) preds = et.predict_proba(validationSet[feature_names])[:, 1] ll = log_loss(np.array(validationSet["target"]), preds) print "# Data_version : {0} | Fold : {1} | log_loss : {2}".format( i + 1, j + 1, ll) df = pd.DataFrame({ "Fold": np.repeat((j + 1), validationSet.shape[0]), "ID":
# Tail of a data-loading function whose `def` line and first statements lie
# before this snippet (it is called as load_data() below).
    X_test = X[n_train:]
    y_test = y[n_train:]
    # Standardize first 10 features (the numerical ones)
    # Statistics come from the training rows only and are applied to both
    # splits.
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    # Neutral mean/std from column 10 onward, so those columns pass through
    # unchanged.
    mean[10:] = 0.0
    std[10:] = 1.0
    # NOTE(review): a zero standard deviation in one of the first 10 columns
    # would divide by zero here.
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    return X_train, X_test, y_train, y_test


# Estimators under test, keyed by display name.
ESTIMATORS = {
    'RandomForest': RandomForestClassifier(n_estimators=100),
    'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=100)
}

# NOTE: runs at import time -- this call sits outside the __main__ guard.
X_train, X_test, y_train, y_test = load_data()

# NOTE(review): entries are presumably (backend name, Parallel class, keyword
# arguments) -- confirm against the code that iterates BACKENDS.
BACKENDS = [('threading', Parallel, {}),
            ('dask.distributed', Parallel2,
             {'scheduler_host': SCHEDULER_ADDRESS, 'scatter': [X_train]})]

if __name__ == "__main__":
    # Summary of the loaded data; labels are left-justified to 25 characters.
    print("Dataset statistics:")
    print("===================")
    print("%s %d" % ("number of features:".ljust(25), X_train.shape[1]))
    print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size))
    print("%s %s" % ("data type:".ljust(25), X_train.dtype))
    # The print call below continues past the end of this snippet.
    print("%s %d (pos=%d, neg=%d, size=%dMB)"
# Binarize the label sets held in the incoming `y_train` into an indicator
# matrix `y`; `y_train` is then rebound by the split on the next statement.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y_train)
# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,random_state=random_state)
n_train = len(y_train)
n_test = len(y_test)
# Materialize the 5 folds once so every classifier below sees the same splits.
# NOTE(review): KFold(n, n_folds=...) looks like the signature of the old
# sklearn.cross_validation module -- confirm the scikit-learn version in use.
kfs = list(KFold(n_train, n_folds=5))
# Base classifiers, each wrapped in OneVsRestClassifier.
clfs = [
    OneVsRestClassifier(RandomForestClassifier(n_estimators=512,criterion='entropy',max_depth=8), n_jobs=cores),
    OneVsRestClassifier(RandomForestClassifier(n_estimators=1024,criterion='gini',max_depth=8), n_jobs=cores),
    OneVsRestClassifier(svm.SVC(kernel='linear', C= 4.0, probability=True), n_jobs=cores),
    OneVsRestClassifier(svm.SVC(kernel='rbf', C= 2.0, gamma = np.power(2.0, -8.075),probability=True), n_jobs=cores),
    OneVsRestClassifier(ExtraTreesClassifier(n_estimators=512,criterion='entropy',max_depth=16), n_jobs=cores),
    OneVsRestClassifier(GradientBoostingClassifier(n_estimators=512,learning_rate=0.01 , max_depth=8), n_jobs=cores)
]
#meta features
# Accumulators, all starting as None.
blend_train_X = None
blend_train_y = None
blend_test = None
# NOTE(review): Python 2 print statement below.
for j, clf in enumerate(clfs):
    print j, clf
    # Per-classifier accumulators, reset for every base classifier.
    blend_test_j = None
    blend_train_X_j = None
    # The body of this inner loop lies beyond the end of the snippet.
    for i, (train, test) in enumerate(kfs):