0 / 0  # NOTE(review): leftover artifact -- evaluating this raises ZeroDivisionError; looks like a stray notebook-cell marker to delete.
# Keep only the top-N features ranked by the helper defined elsewhere in this file.
feature_top_n = get_top_n_features(titanic_train_data_X, titanic_train_data_Y, feature_to_pick)
titanic_train_data_X = titanic_train_data_X[feature_top_n]
titanic_test_data_X = titanic_test_data_X[feature_top_n]
#oversampling
#titanic_train_data_X, titanic_train_data_Y = RandomOverSampler().fit_sample(titanic_train_data_X, titanic_train_data_Y)
# Balance the classes with SMOTE.  NOTE(review): fit_sample was renamed
# fit_resample in newer imbalanced-learn releases -- confirm the pinned version.
titanic_train_data_X, titanic_train_data_Y = SMOTE().fit_sample(
    titanic_train_data_X, titanic_train_data_Y)
#voting
#xgb_est = xgb.XGBClassifier(learning_rate=0.03, random_state=3, n_estimators=900, subsample=0.8, n_jobs = 50,colsample_bytree = 0.8, max_depth = 10, verbose=1)
#svm_est = svm.SVC(kernel='rbf', gamma = 1e-3, C =100)
# Base estimators presumably destined for a voting ensemble built further on.
ada_est = ensemble.AdaBoostClassifier(n_estimators=1000, random_state=3, learning_rate=0.1)
rf_est = ensemble.RandomForestClassifier(n_estimators=1000, criterion='gini', max_features='sqrt', max_depth=10, min_samples_split=4, min_samples_leaf=20, n_jobs=50, random_state=42, verbose=1)
# NOTE(review): the call below is truncated in the visible source (unterminated).
gbm_est = ensemble.GradientBoostingClassifier(n_estimators=1000, learning_rate=0.003, loss='exponential', min_samples_split=3, min_samples_leaf=20,
# Vectorise the held-out text with the CountVectorizer fitted earlier.
testing_data = count_vect.transform(X_test)
print(testing_data)
#training_data = training_data.astype(float)
#y = y.as_matrix().astype(np.float)
# Shape probes (interactive/debug only -- results are discarded).
X_train_counts.shape
X_train.shape
y_train.shape
# Scrub NaNs from the labels and from the sparse matrix's data buffer.
y_train = np.nan_to_num(y_train)
np.isnan(y_train).any()
training_data.data = np.nan_to_num(training_data.data)
##################################
#ada boost
# AdaBoost over a decision-tree base; the grid tunes both the booster and the
# underlying tree (via the base_estimator__ parameter prefix).
ada_estimator = ensemble.AdaBoostClassifier(
    base_estimator=tree.DecisionTreeClassifier(), random_state=100)
ada_grid = {
    'n_estimators': list(range(50, 101, 50)),
    'learning_rate': [0.1, 0.2, 1.0],
    'base_estimator__max_depth': [1, 3, 5],
    'base_estimator__criterion': ['entropy', 'gini']
}
# 10-fold accuracy grid search; train scores kept for over/under-fit checks.
ada_grid_estimator = model_selection.GridSearchCV(ada_estimator,
                                                  ada_grid,
                                                  scoring='accuracy',
                                                  cv=10,
                                                  return_train_score=True)
ada_grid_estimator.fit(training_data, y_train)
print(ada_grid_estimator.best_score_)
print(ada_grid_estimator.best_params_)
"""Train an AdaBoost classifier on the 2008 data and emit a submission file."""
from sklearn import ensemble
import utils

# Load the labelled training set and the unlabeled points to predict.
X, y = utils.get_training_data('../train_2008.csv')
holdout = utils.get_test_points('../test_2008.csv')

# AdaBoost with its default base estimator (a depth-1 decision tree).
model = ensemble.AdaBoostClassifier()
model.fit(X, y)

# Hand the fitted predictor to the project's submission writer.
utils.prepare_submission_sklearn(model.predict, holdout)
np.random.seed(10) #fetch dataframes df_train = pd.read_csv(loc_train) df_test = pd.read_csv(loc_test) #shuffle train df to prevent malordered samples df_train = df_train.reindex(np.random.permutation(df_train.index)) #get the feature columns feature_cols = [col for col in df_train.columns if col not in ['class']] #create a train and test set X_train = df_train[feature_cols] X_test = df_test[feature_cols] #fetch the labels into 'y' y = df_train['class'] #classifier config and fitting clf_base = ensemble.RandomForestClassifier(n_estimators=1050, criterion="entropy", max_features=None, random_state=777, n_jobs=-1) clf = ensemble.AdaBoostClassifier(clf_base, n_estimators=4, random_state=93) print "\nFitting:\n", clf, "\non train set shaped:\n", X_train.shape clf.fit(X_train, y) #predicting and storing results with open(loc_submission, "wb") as outfile: print "\nPredicting on test set shaped:\n", X_test.shape, "\nWriting to:", outfile outfile.write("Id,Class\n") for e, val in enumerate(clf.predict_proba(X_test)): outfile.write("%s,%f\n" % (float(e + 1), float(val[1]))) print "\nScript running time:", datetime.now() - start
def EntireDataset():
    """Fit the enabled estimators on the whole dataset and report train accuracy.

    Prompts for a feature-selected data CSV and a labels CSV (both indexed by
    PATIENT), fits every estimator left uncommented in ``estimators``, writes
    per-subject predictions to a results CSV, pickles the last fitted
    estimator, and prints per-estimator training accuracy.
    """
    # Paths arrive via drag-and-drop, so strip stray quotes/whitespace.
    a = input('Click and drag FEATURE SELECTED ENTIRE DATASET file here: ')
    a = a.strip('\' ')
    data = pd.read_csv(a, encoding='utf-8').set_index('PATIENT')
    b = input('Click and drag LABELS file here: ')
    b = b.strip('\' ')
    labels_df = pd.read_csv(b, encoding='utf-8').set_index('PATIENT')
    # First (only) label column as a plain array.
    labels = np.array(labels_df[labels_df.columns[0]])
    nfeatsmax = len(data.columns)
    # Hidden-layer width for the MLP: two thirds of the feature count.
    nfeatsneural = round((nfeatsmax * 2 / 3))
    # Candidate models; most are commented out in `estimators` below.
    rf = ensemble.RandomForestClassifier(max_features=nfeatsmax, max_depth=5, bootstrap=False)
    et = ensemble.ExtraTreesClassifier(max_features=nfeatsmax, max_depth=5, bootstrap=False)
    kn = neighbors.KNeighborsClassifier(n_neighbors=nfeatsmax, p=1)
    nb = naive_bayes.GaussianNB()
    dt = tree.DecisionTreeClassifier(max_features=nfeatsmax, max_depth=5, criterion='entropy')
    ls = svm.LinearSVC(penalty='l1', dual=False)
    gb = ensemble.GradientBoostingClassifier(loss='exponential', max_depth=2)
    nn = neural_network.MLPClassifier(hidden_layer_sizes=(nfeatsneural, nfeatsneural, nfeatsneural,), learning_rate_init=0.0001, max_iter=500)
    ab = ensemble.AdaBoostClassifier()
    bc = ensemble.BaggingClassifier(base_estimator=rf)
    vc = ensemble.VotingClassifier(estimators=[('gb', gb), ('ab', ab), ('bc', bc)], voting='soft')
    # Only the uncommented entries are actually fitted.
    estimators = {
        #'randomforest': rf,
        #'extratrees': et,
        #'kneighbors': kn,
        'naivebayes': nb,
        #'decisiontree': dt,
        #'linearsvc': ls,
        #'gboost': gb,
        #'neuralnet': nn,
        #'adaboost': ab,
        #'bagging': bc,
        #'voting': vc,
    }
    # Long-format accumulator: one row per (estimator, subject).
    results = {
        'estimator': [],
        'subjects': [],
        'labels': [],
        'predictions': [],
        'scores': [],
        'attempts': []
    }
    for j, k in zip(estimators.keys(), estimators.values()):
        k.fit(data, labels)
        predict_train = k.predict(data)
        # 1 where the training prediction matches the label, else 0.
        train_scores = [
            1 if x == y else 0 for x, y in zip(labels, predict_train)
        ]
        results['estimator'].extend([j] * len(data))
        results['subjects'].extend(data.index)
        results['labels'].extend(labels)
        results['predictions'].extend(predict_train)
        results['scores'].extend(train_scores)
        results['attempts'].extend([1] * len(data))
    results_df = pd.DataFrame.from_dict(results).set_index('subjects')
    results_df.to_csv(
        path_or_buf=
        '/media/james/ext4data/current/projects/pfizer/combined-study/entire_dataset_results.csv'
    )
    # NOTE(review): only the estimator fitted last (loop variable `k`) is
    # pickled -- intentional only while a single estimator is enabled above.
    with open(
            '/media/james/ext4data/current/projects/pfizer/combined-study/trainedclassifier.pickle',
            'wb') as f:
        pickle.dump(k, f, pickle.HIGHEST_PROTOCOL)
    print('ENTIRE DATASET ACCURACY')
    # Percent correct per estimator (scores summed over subjects / attempts).
    trd = results_df.groupby('estimator').sum()
    trsum = (trd['scores'] / trd['attempts']) * 100
    print(trsum)
    return
# NOTE(review): the top of this fragment reads like the body of the
# `predictificate` helper called below -- its `def` line is not visible here,
# so data/target/test/clfs come from an enclosing scope.
res = []
for clf in clfs:
    clf.fit(data, target)
    res.append(clf.predict(test))
# Majority vote across the fitted classifiers, row by row.
pred = [most_common(x) for x in zip(*res)]
f = open('final-predictions.csv', 'w')
f.write("ID,Category\n")
# NOTE(review): the loop variable `res` shadows the prediction list above.
for i, res in enumerate(pred):
    f.write("%d,%d\n" % (i + 1, res))
f.close()
clfs = []
# Through cv testing, I found the optimal number of estimators to be 15
clfs.append(ensemble.RandomForestClassifier(n_estimators=150))
clfs.append(ensemble.GradientBoostingClassifier(n_estimators=200))
clfs.append(ensemble.AdaBoostClassifier(n_estimators=135))
#clfs.append(neighbors.KNeighborsClassifier(n_neighbors=10))
#clfs.append(svm.SVC())
predictificate(data, target, test, clfs)
# I use the following code to find good hyperparameter values
#scores = cross_validation.cross_val_score(
#clf, data, target, cv=5)
#print("Accuracy: %0.2f (+/- %0.2f) %f" % (scores.mean(), scores.std() * 2, x))
# NOTE(review): the statement opening this fragment (a DataFrame.drop call)
# is cut off in the visible source.
axis=1, inplace=False)
titanic2.shape
# First titanic_train.shape[0] rows of the combined frame are the train split.
X_train = titanic2[0:titanic_train.shape[0]]
X_train.shape
X_train.info()
y_train = titanic_train['Survived']
#create estimators for voting classifier
#M1
dt_estimator = tree.DecisionTreeClassifier(random_state=100)
#M2
rf_estimator = ensemble.RandomForestClassifier(random_state=100)
#M3
ada_estimator = ensemble.AdaBoostClassifier(random_state=100)
#voting classifier
voting_estimator = ensemble.VotingClassifier(
    estimators=[('dt', dt_estimator), ('rf', rf_estimator),
                ('ada', ada_estimator)])
# Per-member hyper-parameters use the `<name>__<param>` convention.
voting_grid = {
    'dt__max_depth': [3, 5, 7],
    'rf__n_estimators': [20],
    'rf__max_features': [5, 7, 9],
    'rf__max_depth': [2, 4, 6],
    'ada__n_estimators': [20]
}
# NOTE(review): this GridSearchCV call is truncated in the visible source.
grid_voting_estimator = model_selection.GridSearchCV(voting_estimator,
                                                     voting_grid,
                                                     verbose=1,
##fourth classifier strong for precision and recall (based on final features list)- use findings from 2nd classifier from sklearn import ensemble, tree #from sklearn.grid_search import GridSearchCV #parameter = {'algorithm':['SAMME', 'SAMME.R'], # 'n_estimators':[2,5,10,25,50]} #clf = GridSearchCV(ensemble.AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(criterion='entropy'), # n_estimators=50),parameter) #clf = clf.fit(features,labels) #print clf.best_estimator_ ##fifth classifier strong for precision and recall (based on final features list)- use findings from classifiers 2&4 from sklearn import ensemble, tree tree = tree.DecisionTreeClassifier(criterion='entropy') clf = ensemble.AdaBoostClassifier(base_estimator=tree, algorithm='SAMME', n_estimators=50) clf = clf.fit(features, labels) print clf.feature_importances_ print features_list[1:] ### Task 5: Tune your classifier to achieve better than .3 precision and recall ### using our testing script. ### Because of the small size of the dataset, the script uses stratified ### shuffle split cross validation. For more info: ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html ##creating new test classifier using kfold cross validation PERF_FORMAT_STRING = "\ \tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\ Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
# Drop identifier/leakage columns before modelling.
titanic2 = titanic1.drop([
    'PassengerId', 'Name', 'Age', 'Ticket', 'Cabin', 'Survived', 'SibSp',
    'Parch', 'Fare'
], axis=1, inplace=False)
titanic2.info()
# First titanic_train.shape[0] rows of the combined frame are the train split.
X_train = titanic2[0:titanic_train.shape[0]]
X_train.shape
X_train.info()
y_train = titanic_train['Survived']
#Base Model..Building
dt_estimator = tree.DecisionTreeClassifier(random_state=2017)
#Model Building
ada_estimator = ensemble.AdaBoostClassifier(random_state=2017, base_estimator=dt_estimator)
# The base tree is tuned through the booster with the base_estimator__ prefix.
ada_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.123, 0.5344, 0.789],
    'base_estimator__max_depth': [3, 4]
}
grid_ada_estimator = model_selection.GridSearchCV(ada_estimator, ada_grid, cv=10, n_jobs=1)
grid_ada_estimator.fit(X_train, y_train)
# NOTE(review): grid_scores_ was removed in scikit-learn 0.20 -- on newer
# versions use cv_results_ instead.
print(grid_ada_estimator.grid_scores_)
print(grid_ada_estimator.best_score_)  #83
print(grid_ada_estimator.best_params_)
grid_ada_estimator.best_estimator_  #83
print(grid_ada_estimator.score(X_train, y_train))  #83
# Three parallel train/validation splits over the same rows (shared
# random_state so the splits align): raw/calculated features, binned
# features, and dummy-coded features.
train1_x, test1_x, train1_y, test1_y = train_test_split(train[train_x_calc], train[target], random_state=0)
train1_x_bin, test1_x_bin, train1_y_bin, test1_y_bin = train_test_split(train[train_x_bin], train[target], random_state=0)
train1_x_dummy, test1_x_dummy, train1_y_dummy, test1_y_dummy = train_test_split(train_dummy[train_x_dummy], train[target], random_state=0)
#Discrete Variable Correlation by Survival using group by aka pivot table
for x in train_x:
    if train[x].dtype != 'float64':
        print('Survival Correlation by:', x)
        print(train[[x, target[0]]].groupby(x, as_index=False).mean())
        print('-' * 10, '\n')
# Machine Learning Algorithm (MLA) Selection and Initialization
# NOTE(review): this list literal is truncated in the visible source.
MLA = [
    # Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),
    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),
    # GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
# Depth-3 decision tree.  NOTE(review): the model is fitted on the train
# split but cross_val_score refits it on the *test* split -- confirm intent.
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X_train, y_train)
scores = model_selection.cross_val_score(clf, X_test, y_test, cv=10)
print(scores)
#%%
#Testing Gausian Naiive bayes
gnb = naive_bayes.GaussianNB()
gnb.fit(X_train, y_train)
scores = model_selection.cross_val_score(gnb, X_test, y_test, cv=10)
print(scores)
#%%
# AdaBoost over a depth-3 tree; grid-search the base tree's split settings.
d_tree = DecisionTreeClassifier(max_depth=3)
aba = ensemble.AdaBoostClassifier(base_estimator=d_tree, n_estimators=50)
params = {
    "base_estimator__criterion": ["gini", "entropy"],
    "base_estimator__splitter": ["best", "random"]
}
# 20-fold shuffled CV, optimising average precision; refit the best combo.
cv_n = model_selection.KFold(n_splits=20, shuffle=True)
cv_search = model_selection.GridSearchCV(aba,
                                         param_grid=params,
                                         scoring="average_precision",
                                         cv=cv_n,
                                         refit=True,
                                         n_jobs=2)
cv_search.fit(X_train, y_train)
print(cv_search.score(X_test, y_test))
# convert into a large array training_X = convert_image_list( training_images ) training_Y = np.ravel( np.concatenate( tuple(j for j in training_output ) ) ) if options.debug: print("Fitting...") if options.method=="SVM": clf = svm.SVC() elif options.method=="nuSVM": clf = svm.NuSVC() elif options.method=='NN': clf = neighbors.KNeighborsClassifier(options.n) elif options.method=='RanForest': clf = ensemble.RandomForestClassifier(n_estimators=options.n,random_state=options.random) elif options.method=='AdaBoost': clf = ensemble.AdaBoostClassifier(n_estimators=options.n,random_state=options.random) elif options.method=='tree': clf = tree.DecisionTreeClassifier(random_state=options.random) else: clf = svm.LinearSVC() #scores = cross_validation.cross_val_score(clf, training_X, training_Y) #print scores clf.fit( training_X, training_Y ) #print(clf.score(training_X,training_Y)) if options.debug: print( clf ) with open(options.save,'wb') as f:
titanic_train.info()
# One-hot encode the categorical columns.
titanic_train1 = pd.get_dummies(titanic_train, columns=['Pclass', 'Sex', 'Embarked'])
titanic_train1.shape
titanic_train1.info()
titanic_train1.head(6)
# Drop identifier/free-text columns and the target itself.
X_train = titanic_train1.drop(
    ['PassengerId', 'Age', 'Cabin', 'Ticket', 'Name', 'Survived'], 1)
y_train = titanic_train['Survived']
#Note that we take entire data into consideration in boosting. That's why we have to cut the tree depth to control the overfitting.
#In this case we are giving max_depth=3
dt_estimator = tree.DecisionTreeClassifier(max_depth=3)
# Second positional argument is n_estimators (5 boosting rounds).
ada_tree_estimator1 = ensemble.AdaBoostClassifier(dt_estimator, 5)
#Parameter tuning
#n_estimators(no. of trees to grow), learning_rate(Learning rate shrinks the contribution of each classifier by learning_rate.)
#There is a trade-off between learning_rate and n_estimators.
#Pass the learning_rate less than default which is 1.
ada_grid = {'n_estimators': [5, 8, 10, 12], 'learning_rate': [0.1, 0.5, 0.9]}
ada_grid_estimator = model_selection.GridSearchCV(ada_tree_estimator1, ada_grid, cv=10, n_jobs=1)
ada_grid_estimator.fit(X_train, y_train)
# Inspect search results (interactive -- values are discarded).
ada_grid_estimator.cv_results_
ada_grid_estimator.best_score_
ada_grid_estimator.best_params_
def get_top_n_features(titanic_train_data_X, titanic_train_data_Y, top_n_features):
    """Rank features with three tree ensembles and return the union of tops.

    Grid-searches a RandomForest, an AdaBoost and an ExtraTrees classifier on
    the training data, takes each best estimator's top ``top_n_features`` by
    ``feature_importances_``, and returns the de-duplicated concatenation of
    the three rankings as a pandas Series.
    """
    # Random Forest
    # NOTE(review): bare RandomForestClassifier here vs ensemble.* below --
    # both names must be imported at file level.
    rf_est = RandomForestClassifier(random_state=3)
    rf_param_grid = {
        'n_estimators': [500],
        'max_features': [5, 6, 10],
        'min_samples_split': [2, 3],
        'max_depth': [20]
    }
    rf_grid = model_selection.GridSearchCV(rf_est, rf_param_grid, n_jobs=25, cv=10, verbose=1)
    rf_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    # Sort features by importance (descending).
    feature_imp_sorted_rf = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': rf_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_rf = feature_imp_sorted_rf.head(top_n_features)['feature']
    print('Sample 25 Features from RF Classifier')
    print(str(features_top_n_rf[:25]))
    # AdaBoost
    ada_est = ensemble.AdaBoostClassifier(random_state=42)
    ada_param_grid = {'n_estimators': [500], 'learning_rate': [0.5, 0.6]}
    ada_grid = model_selection.GridSearchCV(ada_est, ada_param_grid, n_jobs=25, cv=10, verbose=1)
    ada_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    # Sort by importance.
    feature_imp_sorted_ada = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': ada_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_ada = feature_imp_sorted_ada.head(top_n_features)['feature']
    print('Sample 25 Features from ADA Classifier:')
    print(str(features_top_n_ada[:25]))
    # ExtraTree
    et_est = ensemble.ExtraTreesClassifier(random_state=42)
    et_param_grid = {
        'n_estimators': [500],
        'min_samples_split': [3, 4],
        'max_depth': [15]
    }
    et_grid = model_selection.GridSearchCV(et_est, et_param_grid, n_jobs=25, cv=10, verbose=1)
    et_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    # Sort by importance.
    feature_imp_sorted_et = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': et_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_et = feature_imp_sorted_et.head(top_n_features)['feature']
    print('Sample 25 Features from ET Classifier:')
    print(str(features_top_n_et[:25]))
    # Merge the three models' top-N picks and drop duplicates.
    features_top_n = pd.concat(
        [features_top_n_rf, features_top_n_ada, features_top_n_et],
        ignore_index=True).drop_duplicates()
    return features_top_n
# 5-fold stratified CV indices (old sklearn cross_validation-style API);
# Python 2 fragment.
skf = list(CV.StratifiedKFold(trainingY, 5))
rf_param = [2000, 12]  #n_est, depth
gb_param = [400, 4, 'auto']  #n_est, depth
ada_param = [1000]  #n_est
clfs = [
    ensemble.RandomForestClassifier(n_estimators=rf_param[0], n_jobs=16, max_depth=rf_param[1], max_features=0.5, random_state=1126),
    ensemble.GradientBoostingClassifier(n_estimators=gb_param[0], max_depth=gb_param[1], max_features=gb_param[2], random_state=1126),
    ensemble.AdaBoostClassifier(n_estimators=ada_param[0], random_state=1126)
]
# Out-of-fold prediction matrices for stacking/blending: one column per model.
dataset_blend_train = np.zeros((trainingX.shape[0], len(clfs)))
dataset_blend_test = np.zeros((testingX.shape[0], len(clfs)))
# Cross-validation
for j, clf in enumerate(clfs):
    print j, clf
    print >> log_f, clf
    # Per-fold test predictions, averaged later into one column.
    dataset_blend_test_j = np.zeros((testingX.shape[0], len(skf)))
    for i, (train, test) in enumerate(skf):
        print "Fold", i
        X_fold_train = trainingX[train]
        y_fold_train = trainingY[train]
        X_fold_test = trainingX[test]
# NOTE(review): the inner fold loop is cut off here in the visible source.
def main():
    """Load the training data and run a hyper-parameter search for AdaBoost."""
    features, labels = helpers.load_data()
    # Default AdaBoost; param_search explores its hyper-parameter space.
    classifier = ensemble.AdaBoostClassifier()
    analysis.param_search(classifier, features, labels)
def plotting(X, Y, Xt, Yt, labelx, labely, outputfile): h = .02 # step size in the mesh classifiers = dict(knn=neighbors.KNeighborsClassifier(4), logistic=linear_model.LogisticRegression(C=1e5), svm=svm.SVC(C=1e5), adaboost=ensemble.AdaBoostClassifier(), naivebay=naive_bayes.GaussianNB()) cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF']) cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) fignum = 1 # we create an instance of Neighbours Classifier and fit the data. for name, clf in classifiers.iteritems(): clf.fit(X, Y) score = clf.score(Xt, Yt) if score > 0.85: print '....... plotting for ' + name pl.cla() pl.clf() # Plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, m_max]x[y_min, y_max]. x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) # Put the result into a color plot Z = Z.reshape(xx.shape) pl.figure(fignum, figsize=(4, 3)) pl.pcolormesh(xx, yy, Z, cmap=cmap_light) # Plot also the training points pl.scatter(X[:, 0], X[:, 1], s=30, c=Y, edgecolors='k', cmap=cmap_bold) pl.xlim(xx.min(), xx.max()) pl.ylim(yy.min(), yy.max()) pl.xticks(()) pl.yticks(()) fignum += 1 pl.ylabel(labely) pl.xlabel(labelx) pl.text(xx.min(), yy.min(), name + " - Accuracy " + str(round(score, 2)), ha='left', fontsize=14, style='italic') if score > 0.95: pl.savefig(outputfile + '_' + name + '_SUPERGOOD.png', orientation='landscape') else: pl.savefig(outputfile + '_' + name + '.png', orientation='landscape') return score, name
def multi_classifier_voting_predication(data1, data1_x_bin, cv_split, Target):
    """Cross-validate and fit hard- and soft-voting ensembles.

    data1: DataFrame containing features and the target column;
    data1_x_bin: feature column names; cv_split: CV splitter passed to
    cross_validate; Target: target column label.  Prints train/test score
    summaries for each ensemble, fits both on the full data, and returns
    the fitted (vote_hard, vote_soft) pair.
    """
    # why choose one model, when you can pick them all with voting classifier
    # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
    # removed models w/o attribute 'predict_proba' required for vote classifier and models with a 1.0 correlation to another model
    vote_est = [
        # Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html
        ('ada', ensemble.AdaBoostClassifier()),
        ('bc', ensemble.BaggingClassifier()),
        ('etc', ensemble.ExtraTreesClassifier()),
        ('gbc', ensemble.GradientBoostingClassifier()),
        ('rfc', ensemble.RandomForestClassifier()),
        # Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
        ('gpc', gaussian_process.GaussianProcessClassifier()),
        # GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
        ('lr', linear_model.LogisticRegressionCV()),
        # Navies Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
        ('bnb', naive_bayes.BernoulliNB()),
        ('gnb', naive_bayes.GaussianNB()),
        # Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html
        ('knn', neighbors.KNeighborsClassifier()),
        # SVM: http://scikit-learn.org/stable/modules/svm.html
        ('svc', svm.SVC(probability=True)),
        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        ('xgb', XGBClassifier())
    ]
    # Hard Vote or majority rules
    vote_hard = ensemble.VotingClassifier(estimators=vote_est, voting='hard')
    vote_hard_cv = model_selection.cross_validate(vote_hard,
                                                  data1[data1_x_bin],
                                                  data1[Target],
                                                  cv=cv_split,
                                                  return_train_score=True)
    vote_hard.fit(data1[data1_x_bin], data1[Target])
    # Scores reported as percentages; 3*std as a spread indicator.
    print("Hard Voting Training w/bin score mean: {:.2f}".format(
        vote_hard_cv['train_score'].mean() * 100))
    print("Hard Voting Test w/bin score mean: {:.2f}".format(
        vote_hard_cv['test_score'].mean() * 100))
    print("Hard Voting Test w/bin score 3*std: +/- {:.2f}".format(
        vote_hard_cv['test_score'].std() * 100 * 3))
    print('-' * 10)
    # Soft Vote or weighted probabilities
    vote_soft = ensemble.VotingClassifier(estimators=vote_est, voting='soft')
    vote_soft_cv = model_selection.cross_validate(vote_soft,
                                                  data1[data1_x_bin],
                                                  data1[Target],
                                                  cv=cv_split,
                                                  return_train_score=True)
    vote_soft.fit(data1[data1_x_bin], data1[Target])
    print("Soft Voting Training w/bin score mean: {:.2f}".format(
        vote_soft_cv['train_score'].mean() * 100))
    print("Soft Voting Test w/bin score mean: {:.2f}".format(
        vote_soft_cv['test_score'].mean() * 100))
    print("Soft Voting Test w/bin score 3*std: +/- {:.2f}".format(
        vote_soft_cv['test_score'].std() * 100 * 3))
    print('-' * 10)
    return vote_hard, vote_soft
# NOTE(review): the list literal opening this fragment (sampler display
# names) is cut off in the visible source.
'Edited Nearest Neighbours', 'Repeated Edited Nearest Neighbours',
'All KNN', 'Instance Hardness Threshold',
'Neighbour hood Cleaning Rule', 'OneSidedSelection',
'Random Under Sampler', 'TomekLinks(random_state=42)'
]
# Shared hyper-parameters for the gradient-boosting entry below.
params = {'n_estimators': 10, 'max_depth': 3, 'subsample': 0.5,
          'learning_rate': 0.89, 'min_samples_leaf': 1, 'random_state': 5}
# Candidate classifiers evaluated in parallel with the names list below.
clfs = [
    ensemble.GradientBoostingClassifier(**params),
    BernoulliNB(),
    DecisionTreeClassifier(random_state=0),
    svm.SVC(kernel='rbf', probability=True),
    SGDClassifier(loss="modified_huber", penalty='l1'),
    RandomForestClassifier(n_estimators=9),
    ensemble.AdaBoostClassifier(),
    svm.SVC(kernel='linear', probability=True),
    MLPClassifier(solver='lbfgs', alpha=1e-5,
                  hidden_layer_sizes=(150, 50, 15, 5, 3), random_state=1),
    neighbors.KNeighborsClassifier(n_neighbors=5),
    NearestCentroid(metric='euclidean', shrink_threshold=None),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()
]
# Display names parallel to `clfs`.
# NOTE(review): this list is truncated in the visible source.
clfs_name = ['GradientBoostingClassifier', 'Bernoulli Naive Bayes',
             'DecisionTreeClassifier', 'SVM (rbf)',
             'Stochastic Gradient Descent', 'Radom Forest Classifier',
             'Ada Boost Classifier', 'SVM (linear)', 'Multi Layer Perceptron',
             'K Nearest Neighbors', 'Nearest Centroid Classifier',
             'Guassian Naive Bayes',
def adaBoostClassifierAlgorithm():
    """Fit, predict with, and report on a default AdaBoost classifier."""
    from sklearn import ensemble

    classifier = ensemble.AdaBoostClassifier()
    algorithmFitPredAndShow(classifier, "AdaBoostClassifier")
# NOTE(review): these keyword arguments are the tail of a Keras `fit` call
# whose opening is cut off in the visible source.
batch_size=548,
epochs=1000000,
shuffle=True,
validation_data=(np.array(x_mlpval), krsutil.to_categorical(y_mlpval)),
callbacks=callbacks_list,
verbose=2)
#AdaBoost algorithm with a decision tree as the base classifier for the first task, including several rounds of grid search.
# Manual sweep over the base tree's max_depth; validation F1 noted per line.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(random_state=42), random_state=42) #F1 0.728 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=2, random_state=42), random_state=42) #Validation F1 0.484 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=3, random_state=42), random_state=42) #Validation F1 0.745 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=4, random_state=42), random_state=42) #Validation F1 0.775 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=5, random_state=42), random_state=42) #Validation F1 0.779 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=6, random_state=42), random_state=42) #Validation F1 0.780 at first.
# Depth 7 won the sweep above.
ensembletree_model = sklensemble.AdaBoostClassifier(
    skltree.DecisionTreeClassifier(max_depth=7, random_state=42),
    random_state=42)  #Validation F1 0.787 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=8, random_state=42), random_state=42) #Validation F1 0.764 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=9, random_state=42), random_state=42) #Validation F1 0.774 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=10, random_state=42), random_state=42) #Validation F1 0.775 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=11, random_state=42), random_state=42) #Validation F1 0.770 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=12, random_state=42), random_state=42) #Validation F1 0.759 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=13, random_state=42), random_state=42) #Validation F1 0.751 at first.
# Progressive narrowing of the n_estimators grid across search rounds:
#ensembletree_hyper={'n_estimators':[2,3,4,5,6,7,8,9,10,50,100,200,400,800,1000]}
#ensembletree_hyper={'n_estimators':[50,100,150]}
#ensembletree_hyper={'n_estimators':[90,100,110]}
#ensembletree_hyper={'n_estimators':[95,100,105]}
ensembletree_hyper = {
    'n_estimators': [95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105]
}
# Macro-averaged F1 as the model-selection criterion.
ensembletree_scorer = sklmet.make_scorer(sklmet.f1_score, average='macro')
predict_y = model.fit(train_X, train_failed_y).predict(test_X) #只看挂科这一类分类效果好 坏 predict_failed_y = [1 - x for x in predict_y] print("------ExtraTree---------") f1 = f1_score(test_failed_y, predict_failed_y) print("f1:%f" % f1) macro_auc = roc_auc_score(test_failed_y, predict_failed_y, average="macro") print("macro auc:%f" % macro_auc) accuracy = accuracy_score(test_failed_y, predict_failed_y) print("accuracy:%f" % accuracy) precision = precision_score(test_failed_y, predict_failed_y) print("precision:%f" % precision) recall = recall_score(test_failed_y, predict_failed_y) print("recall:%f" % recall) model = ensemble.AdaBoostClassifier(n_estimators=20, random_state=random_state) predict_y = model.fit(train_X, train_failed_y).predict(test_X) #只看挂科这一类分类效果好 坏 predict_failed_y = [1 - x for x in predict_y] print("------Adaboost---------") f1 = f1_score(test_failed_y, predict_failed_y) print("f1:%f" % f1) macro_auc = roc_auc_score(test_failed_y, predict_failed_y, average="macro") print("macro auc:%f" % macro_auc) accuracy = accuracy_score(test_failed_y, predict_failed_y) print("accuracy:%f" % accuracy) precision = precision_score(test_failed_y, predict_failed_y) print("precision:%f" % precision) recall = recall_score(test_failed_y, predict_failed_y) print("recall:%f" % recall)
def InnerHoldout():
    """Fit estimators on one inner-CV fold and report train/holdout accuracy.

    Prompts for a feature-selected fold CSV, a labels CSV (both indexed by
    PATIENT) and a pickled CV-split file, asks which fold to use, fits every
    estimator left uncommented in ``estimators``, writes per-subject train
    and test results to CSVs, pickles the last fitted estimator, and prints
    per-estimator accuracy summaries.
    """
    # Paths arrive via drag-and-drop, so strip stray quotes/whitespace.
    a = input('Click and drag FEATURE SELECTED SINGLE FOLD DATA file here: ')
    a = a.strip('\' ')
    data = pd.read_csv(a, encoding='utf-8').set_index('PATIENT')
    b = input('Click and drag LABELS file here: ')
    b = b.strip('\' ')
    labels = pd.read_csv(b, encoding='utf-8').set_index('PATIENT')
    c = input('Click and drag OUTER CV file here: ')
    c = c.strip('\' ')
    # Pickled dict with per-fold 'train'/'test' patient-index lists.
    with open(c, 'rb') as f:
        inner_cv = pickle.load(f)
    thisfold = int(input('Which fold is this? '))
    nfeatsmax = len(data.columns)
    # Hidden-layer width for the MLP: two thirds of the feature count.
    nfeatsneural = round((nfeatsmax * 2 / 3))
    # Candidate models; most are commented out in `estimators` below.
    rf = ensemble.RandomForestClassifier(max_features=nfeatsmax, max_depth=5, bootstrap=False)
    et = ensemble.ExtraTreesClassifier(max_features=nfeatsmax, max_depth=5, bootstrap=False)
    kn = neighbors.KNeighborsClassifier(n_neighbors=nfeatsmax, p=1)
    nb = naive_bayes.GaussianNB()
    dt = tree.DecisionTreeClassifier(max_features=nfeatsmax, max_depth=5, criterion='entropy')
    ls = svm.LinearSVC(penalty='l1', dual=False)
    gb = ensemble.GradientBoostingClassifier(loss='exponential', max_depth=2)
    nn = neural_network.MLPClassifier(hidden_layer_sizes=(nfeatsneural,
                                                          nfeatsneural,
                                                          nfeatsneural),
                                      learning_rate_init=0.0001,
                                      max_iter=500)
    ab = ensemble.AdaBoostClassifier()
    bc = ensemble.BaggingClassifier(base_estimator=rf)
    vc = ensemble.VotingClassifier(estimators=[('ab', ab), ('gb', gb), ('bc', bc)], voting='soft')
    # Only the uncommented entries are actually fitted.
    estimators = {
        #'randomforest': rf,
        #'extratrees': et,
        #'kneighbors': kn,
        #'naivebayes': nb,
        #'decisiontree': dt,
        'linearsvc': ls,
        #'gboost': gb,
        #'neuralnet': nn,
        #'adaboost': ab,
        #'bagging': bc,
        #'voting': vc
    }
    # Long-format accumulators: one row per (estimator, subject).
    train_results = {
        'fold': [],
        'estimator': [],
        'subjects': [],
        'labels': [],
        'predictions': [],
        'scores': [],
        'attempts': []
    }
    test_results = {
        'fold': [],
        'estimator': [],
        'subjects': [],
        'labels': [],
        'predictions': [],
        'scores': [],
        'attempts': []
    }
    # Joining on the fold's id index restricts data/labels to that fold.
    train_ids = pd.DataFrame(index=inner_cv['train'][thisfold - 1])
    X_train = train_ids.join(data)
    y_train_df = train_ids.join(labels)
    y_train = np.array(y_train_df[y_train_df.columns[0]])
    test_ids = pd.DataFrame(index=inner_cv['test'][thisfold - 1])
    X_test = test_ids.join(data)
    y_test_df = test_ids.join(labels)
    y_test = np.array(y_test_df[y_test_df.columns[0]])
    for j, k in zip(estimators.keys(), estimators.values()):
        k.fit(X_train, y_train)
        predict_train = k.predict(X_train)
        # 1 where the prediction matches the label, else 0.
        train_scores = [
            1 if x == y else 0 for x, y in zip(y_train, predict_train)
        ]
        train_results['fold'].extend([thisfold] * len(X_train))
        train_results['estimator'].extend([j] * len(X_train))
        train_results['subjects'].extend(train_ids.index)
        train_results['labels'].extend(y_train)
        train_results['predictions'].extend(predict_train)
        train_results['scores'].extend(train_scores)
        train_results['attempts'].extend([1] * len(X_train))
        predict_test = k.predict(X_test)
        test_scores = [
            1 if x == y else 0 for x, y in zip(y_test, predict_test)
        ]
        test_results['fold'].extend([thisfold] * len(X_test))
        test_results['estimator'].extend([j] * len(X_test))
        test_results['subjects'].extend(test_ids.index)
        test_results['labels'].extend(y_test)
        test_results['predictions'].extend(predict_test)
        test_results['scores'].extend(test_scores)
        test_results['attempts'].extend([1] * len(X_test))
    train_df = pd.DataFrame.from_dict(train_results).set_index('subjects')
    test_df = pd.DataFrame.from_dict(test_results).set_index('subjects')
    train_df.to_csv(
        path_or_buf=
        '/media/james/ext4data/current/projects/pfizer/combined-study/inner_holdout_train_results_fold_'
        + str(thisfold) + '.csv')
    test_df.to_csv(
        path_or_buf=
        '/media/james/ext4data/current/projects/pfizer/combined-study/inner_holdout_test_results_fold_'
        + str(thisfold) + '.csv')
    # NOTE(review): only the estimator fitted last (loop variable `k`) is
    # pickled -- intentional only while a single estimator is enabled above.
    with open(
            '/media/james/ext4data/current/projects/pfizer/combined-study/trainedclassifier_innerfold_'
            + str(thisfold) + '.pickle', 'wb') as f:
        pickle.dump(k, f, pickle.HIGHEST_PROTOCOL)
    print('D_-j RESULT')
    # Percent correct per estimator on the training portion of the fold.
    trd = train_df.groupby('estimator').sum()
    trsum = (trd['scores'] / trd['attempts']) * 100
    print(trsum)
    trmax = trsum.idxmax(axis=1)
    print('\nBest train: {}\n'.format(trmax))
    print('D_j (holdout for estimating model quality) RESULT')
    # Same accuracy summary on the held-out portion.
    ted = test_df.groupby('estimator').sum()
    tesum = (ted['scores'] / ted['attempts']) * 100
    print(tesum)
    temax = tesum.idxmax(axis=1)
    print('\nBest test: {}\n'.format(temax))
    return
def get_skl_estimator(self, **default_parameters):
    """Instantiate the wrapped scikit-learn estimator.

    All keyword arguments are forwarded verbatim to AdaBoostClassifier.
    """
    estimator = ensemble.AdaBoostClassifier(**default_parameters)
    return estimator
def train_l1_models():
    """Train the level-1 model zoo on successive chunks of train.csv.

    The file is consumed in ``num_of_chunks`` chunks of ``chunk_size`` rows;
    the chunk immediately after the last training chunk is held out as a
    validation set.  For every chunk a small neural net plus four sklearn
    ensembles are fitted, scored on the validation set, and serialized under
    ``path + 'l1/'``.

    Returns:
        int: total number of rows consumed (training + validation),
        i.e. ``chunk_size * (num_of_chunks + 1)``.
    """
    chunk_size = 30000000
    num_of_chunks = 3

    # Chunked reads with skiprows lose the header row, so capture the column
    # names once up front and re-apply them to every chunk.
    train_raw = pd.read_csv(path + "train.csv", nrows=2, dtype=init_dtype)
    starting_columns = train_raw.columns

    # Hold-out chunk directly after the training chunks.
    val = pd.read_csv(path + "train.csv",
                      skiprows=chunk_size * num_of_chunks,
                      nrows=chunk_size,
                      dtype=init_dtype)
    val.columns = starting_columns
    val = preproccess_df(val)
    y_val = val['is_attributed']
    val.drop(['is_attributed', 'attributed_time'], axis=1, inplace=True)

    for i in range(num_of_chunks):
        train_raw = pd.read_csv(path + "train.csv",
                                nrows=chunk_size,
                                skiprows=i * chunk_size,
                                dtype=init_dtype)
        train_raw.columns = starting_columns
        print('[{0}] Finished to load data'.format(time.time() - start_time))

        train = preproccess_df(train_raw)
        y_train = train['is_attributed']
        train.drop(['is_attributed', 'attributed_time'], axis=1, inplace=True)

        # Neural-net member.  BUG FIX: DataFrame.as_matrix() was removed in
        # pandas 1.0; .values is the equivalent that works on all versions.
        x3 = train.values
        y3 = np.expand_dims(y_train.values, 1)
        x4 = val.values
        y4 = np.expand_dims(y_val.values, 1)
        y3 = keras.utils.to_categorical(y3, 2)
        y4 = keras.utils.to_categorical(y4, 2)
        print(x3.shape, y3.shape)
        nn_model = get_nn(x3)
        nn_model.fit(x3, y3, epochs=5, class_weight=class_weight, verbose=0,
                     batch_size=20000)
        print('nn trained:', nn_model.evaluate(x4, y4, verbose=0))
        nn_model.save(path + 'l1/model_nn_{0}.h5'.format(i))
        del nn_model  # free memory before fitting the next model

        gb = ensemble.GradientBoostingClassifier()
        gb.fit(train, y_train)
        print('gb', gb.score(val, y_val))
        with open(path + 'l1/gb_{0}.plk'.format(i), 'wb') as infile:
            pickle.dump(gb, infile)
        del gb

        ada = ensemble.AdaBoostClassifier()
        ada.fit(train, y_train)
        print('ada', ada.score(val, y_val))
        with open(path + 'l1/ada_{0}.plk'.format(i), 'wb') as infile:
            pickle.dump(ada, infile)
        del ada

        rf = ensemble.RandomForestClassifier(class_weight=class_weight,
                                             n_jobs=-1)
        rf.fit(train, y_train)
        print('rf', rf.score(val, y_val))
        with open(path + 'l1/rf_{0}.plk'.format(i), 'wb') as infile:
            pickle.dump(rf, infile)
        del rf

        et = ensemble.ExtraTreesClassifier(class_weight=class_weight,
                                           n_jobs=-1)
        et.fit(train, y_train)
        print('et', et.score(val, y_val))
        with open(path + 'l1/et_{0}.plk'.format(i), 'wb') as infile:
            pickle.dump(et, infile)
        del et

    return chunk_size * (num_of_chunks + 1)
def errorCorrectionTrain(input_images,
                         output,
                         parameters=None,
                         debug=False,
                         partition=None,
                         part=None,
                         multilabel=1):
    """Train an error-correction classifier and serialize it to `output`.

    Each entry of `input_images` is a list whose trailing elements are, by
    convention, [..., auto_seg (inp[-3]), mask (inp[-2]), ground (inp[-1])];
    the leading elements are feature image paths.  When ``multilabel > 1`` a
    first classifier learns where auto != ground ("diff"), and a second one
    learns the correct label on those voxels ("direct").

    `partition`/`part` optionally restrict training to one spatial partition
    (forwarded to ``extract_part``).  `output` is a pickle file, or an xgboost
    model file when both methods are 'xgb' (second model gets '_2' appended).

    Re-raises any exception after printing a traceback.
    """
    try:
        use_coord = parameters.get('use_coord', True)
        use_joint = parameters.get('use_joint', True)
        patch_size = parameters.get('patch_size', 1)
        # Feature patches need a safety border around the extracted part.
        border = patch_size * 2
        if patch_size == 0:
            border = 2
        normalize_input = parameters.get('normalize_input', True)
        method = parameters.get('method', 'lSVC')
        method2 = parameters.get('method2', method)
        method_n = parameters.get('method_n', 15)
        method2_n = parameters.get('method2_n', method_n)  # currently unused
        method_random = parameters.get('method_random', None)
        method_max_features = parameters.get('method_max_features', 'auto')
        method_n_jobs = parameters.get('method_n_jobs', 1)
        primary_features = parameters.get('primary_features', 1)

        training_images = []
        training_diff = []
        training_images_direct = []
        training_direct = []

        if debug:
            print("errorCorrectionTrain use_coord={} use_joint={} patch_size={} normalize_input={} method={} output={} partition={} part={}".
                  format(repr(use_coord), repr(use_joint), repr(patch_size),
                         repr(normalize_input), method, output, partition,
                         part))

        coords = None
        total_mask_size = 0
        total_diff_mask_size = 0

        for (i, inp) in enumerate(input_images):
            mask = None
            diff = None
            mask_diff = None

            if inp[-2] is not None:
                mask = extract_part(minc.Label(inp[-2]).data, partition, part,
                                    border)
            ground_data = minc.Label(inp[-1]).data
            auto_data = minc.Label(inp[-3]).data

            ground_shape = ground_data.shape
            ground = extract_part(ground_data, partition, part, border)
            auto = extract_part(auto_data, partition, part, border)
            shape = ground_shape

            # Normalized voxel coordinates in [-1, 1]; computed once since all
            # samples are assumed to share the same grid.
            if coords is None and use_coord:
                c = np.mgrid[0:shape[0], 0:shape[1], 0:shape[2]]
                coords = [
                    extract_part((c[j] - shape[j] / 2.0) / (shape[j] / 2.0),
                                 partition, part, border) for j in range(3)
                ]

            features = [
                extract_part(minc.Image(k, dtype=np.float32).data, partition,
                             part, border) for k in inp[0:-3]
            ]

            # Default to the full volume; refined below when a mask is given.
            mask_size = shape[0] * shape[1] * shape[2]
            if debug:
                print("Training data size:{}".format(len(features)))
            # NOTE(review): reconstructed from collapsed source as running
            # unconditionally (not debug-gated), since total_mask_size gates
            # whether any classifier is fitted at all — confirm upstream.
            if mask is not None:
                mask_size = np.sum(mask)
                print("Mask size:{}".format(mask_size))
            else:
                print("Mask absent")
            total_mask_size += mask_size

            if multilabel > 1:
                diff = (ground != auto)
                # BUG FIX: previously accumulated np.sum(mask), which counted
                # mask voxels (and raised TypeError when mask is None) instead
                # of disagreement voxels.
                total_diff_mask_size += np.sum(diff)
                if mask is not None:
                    mask_diff = diff & (mask > 0)
                    print("Sample {} mask_diff={} diff={}".format(
                        i, np.sum(mask_diff), np.sum(diff)))
                    training_diff.append(diff[mask > 0])
                    training_direct.append(ground[mask_diff])
                else:
                    mask_diff = diff
                    training_diff.append(diff)
                    training_direct.append(ground[diff])

                training_images.append(
                    prepare_features(features,
                                     coords,
                                     mask=mask,
                                     use_coord=use_coord,
                                     use_joint=use_joint,
                                     patch_size=patch_size,
                                     primary_features=primary_features))
                training_images_direct.append(
                    prepare_features(features,
                                     coords,
                                     mask=mask_diff,
                                     use_coord=use_coord,
                                     use_joint=use_joint,
                                     patch_size=patch_size,
                                     primary_features=primary_features))
            else:
                mask_diff = mask
                if mask is not None:
                    training_diff.append(ground[mask > 0])
                else:
                    training_diff.append(ground)
                training_images.append(
                    prepare_features(features,
                                     coords,
                                     mask=mask,
                                     use_coord=use_coord,
                                     use_joint=use_joint,
                                     patch_size=patch_size,
                                     primary_features=primary_features))

            if debug:
                print("feature size:{}".format(len(training_images[-1])))

            if i == 0 and parameters.get('dump', False):
                print("Dumping feature images...")
                for (j, k) in enumerate(training_images[-1]):
                    # BUG FIX: referenced undefined name `images` (NameError);
                    # the shape template is the first extracted feature volume.
                    test = np.zeros_like(features[0])
                    test[mask > 0] = k
                    out = minc.Image(data=test)
                    out.save(name="dump_{}.mnc".format(j), imitate=inp[0])

        # calculate normalization coefficients
        if debug:
            print("Done")

        clf = None
        clf2 = None

        if total_mask_size > 0:
            training_X = convert_image_list(training_images)
            training_Y = np.ravel(
                np.concatenate(tuple(j for j in training_diff)))

            if debug:
                print("Fitting 1st...")

            # Stage-1 classifier: learns where the automatic labels are wrong.
            if method == "xgb":
                clf = None  # xgboost has its own training path below
            elif method == "SVM":
                clf = svm.SVC()
            elif method == "nuSVM":
                clf = svm.NuSVC()
            elif method == 'NC':
                clf = neighbors.NearestCentroid()
            elif method == 'NN':
                clf = neighbors.KNeighborsClassifier(method_n)
            elif method == 'RanForest':
                clf = ensemble.RandomForestClassifier(
                    n_estimators=method_n,
                    n_jobs=method_n_jobs,
                    max_features=method_max_features,
                    random_state=method_random)
            elif method == 'AdaBoost':
                clf = ensemble.AdaBoostClassifier(n_estimators=method_n,
                                                  random_state=method_random)
            elif method == 'AdaBoostPP':
                clf = Pipeline(steps=[('normalizer', Normalizer()),
                                      ('AdaBoost',
                                       ensemble.AdaBoostClassifier(
                                           n_estimators=method_n,
                                           random_state=method_random))])
            elif method == 'tree':
                clf = tree.DecisionTreeClassifier(random_state=method_random)
            elif method == 'ExtraTrees':
                clf = ensemble.ExtraTreesClassifier(
                    n_estimators=method_n,
                    max_features=method_max_features,
                    n_jobs=method_n_jobs,
                    random_state=method_random)
            elif method == 'Bagging':
                clf = ensemble.BaggingClassifier(
                    n_estimators=method_n,
                    max_features=method_max_features,
                    n_jobs=method_n_jobs,
                    random_state=method_random)
            elif method == 'dumb':
                clf = dummy.DummyClassifier(strategy="constant", constant=0)
            else:
                clf = svm.LinearSVC()

            if method == "xgb":
                xg_train = xgb.DMatrix(training_X, label=training_Y)
                param = {}
                num_round = 100
                # use softmax multi-class classification
                param['objective'] = 'multi:softmax'
                # scale weight of positive examples
                param['eta'] = 0.1
                param['max_depth'] = 8
                param['silent'] = 1
                param['nthread'] = 4
                param['num_class'] = 2
                clf = xgb.train(param, xg_train, num_round)
            elif method != 'dumb':
                clf.fit(training_X, training_Y)

            if multilabel > 1 and method != 'dumb':
                if debug:
                    print("Fitting direct...")
                training_X = convert_image_list(training_images_direct)
                training_Y = np.ravel(
                    np.concatenate(tuple(j for j in training_direct)))

                # Stage-2 classifier: learns the correct label on diff voxels.
                if method2 == "xgb":
                    clf2 = None
                elif method2 == "SVM":
                    clf2 = svm.SVC()
                elif method2 == "nuSVM":
                    clf2 = svm.NuSVC()
                elif method2 == 'NC':
                    clf2 = neighbors.NearestCentroid()
                elif method2 == 'NN':
                    clf2 = neighbors.KNeighborsClassifier(method_n)
                elif method2 == 'RanForest':
                    clf2 = ensemble.RandomForestClassifier(
                        n_estimators=method_n,
                        n_jobs=method_n_jobs,
                        max_features=method_max_features,
                        random_state=method_random)
                elif method2 == 'AdaBoost':
                    clf2 = ensemble.AdaBoostClassifier(
                        n_estimators=method_n, random_state=method_random)
                elif method2 == 'AdaBoostPP':
                    clf2 = Pipeline(steps=[('normalizer', Normalizer()),
                                           ('AdaBoost',
                                            ensemble.AdaBoostClassifier(
                                                n_estimators=method_n,
                                                random_state=method_random))])
                elif method2 == 'tree':
                    clf2 = tree.DecisionTreeClassifier(
                        random_state=method_random)
                elif method2 == 'ExtraTrees':
                    clf2 = ensemble.ExtraTreesClassifier(
                        n_estimators=method_n,
                        max_features=method_max_features,
                        n_jobs=method_n_jobs,
                        random_state=method_random)
                elif method2 == 'Bagging':
                    clf2 = ensemble.BaggingClassifier(
                        n_estimators=method_n,
                        max_features=method_max_features,
                        n_jobs=method_n_jobs,
                        random_state=method_random)
                elif method2 == 'dumb':
                    clf2 = dummy.DummyClassifier(strategy="constant",
                                                 constant=0)
                else:
                    clf2 = svm.LinearSVC()

                if method2 == "xgb":
                    xg_train = xgb.DMatrix(training_X, label=training_Y)
                    param = {}
                    num_round = 100
                    # use softmax multi-class classification
                    param['objective'] = 'multi:softmax'
                    # scale weight of positive examples
                    param['eta'] = 0.1
                    param['max_depth'] = 8
                    param['silent'] = 1
                    param['nthread'] = 4
                    param['num_class'] = multilabel
                    clf2 = xgb.train(param, xg_train, num_round)
                elif method2 != 'dumb':
                    # BUG FIX: previously tested `method` here; given the
                    # enclosing guard that was always true, so clf2 was fitted
                    # even for method2=='dumb'.  Now mirrors stage-1 handling.
                    clf2.fit(training_X, training_Y)

            if debug:
                print(clf)
                print(clf2)
        else:
            print("Warning : zero total mask size!, using null classifier")
            clf = dummy.DummyClassifier(strategy="constant", constant=0)

        if method == 'xgb' and method2 == 'xgb':
            # save xgboost models in their native format
            clf.save_model(output)
            # clf2 is only trained in multilabel mode; guard against None.
            if clf2 is not None:
                clf2.save_model(output + '_2')
        else:
            with open(output, 'wb') as f:
                cPickle.dump([clf, clf2], f, -1)

    except mincError as e:
        print("Exception in linear_registration:{}".format(str(e)))
        traceback.print_exc(file=sys.stdout)
        raise
    except:
        print("Exception in linear_registration:{}".format(sys.exc_info()[0]))
        traceback.print_exc(file=sys.stdout)
        raise
# Print the ranking of the top nb_features features with their importances.
for f in range(nb_features):
    print("%d. feature %s (%f)" %
          (f + 1, data.columns[2 + indices[f]],
           extratrees.feature_importances_[indices[f]]))

# Record the column names of those top features (offset by the two leading
# non-feature columns).
for f in sorted(
        np.argsort(extratrees.feature_importances_)[::-1][:nb_features]):
    features.append(data.columns[2 + f])

# Candidate models; the one with the best hold-out accuracy is the winner.
model = {
    "DecisionTree": tree.DecisionTreeClassifier(max_depth=10),
    "RandomForest": ske.RandomForestClassifier(n_estimators=50),
    "GradientBoosting": ske.GradientBoostingClassifier(n_estimators=50),
    "AdaBoost": ske.AdaBoostClassifier(n_estimators=100),
    "GNB": GaussianNB(),
}

# Fit each candidate on the training split and score it on the test split.
results = {}
print("\nNow testing model")
for algo, clf in model.items():
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("%s : %f %%" % (algo, score * 100))
    results[algo] = score

winner = max(results, key=results.get)
# There's a high spike in survival from females with small family size, while generally people with large family size had a more difficult time surviving. # In[17]: X = train.drop('Survived', axis=1) y = train.Survived # I chose to run a voting classifier model, with using multiple models and voting on if each passenger survived or died in the predictions. # # Source: https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy # In[18]: voting_estimates = [ ('ada', ensemble.AdaBoostClassifier(n_estimators=200)), ('bc', ensemble.BaggingClassifier(n_estimators=200)), ('etc', ensemble.ExtraTreesClassifier(n_estimators=200)), ('gbc1', ensemble.GradientBoostingClassifier(n_estimators=200)), ('gbc2', ensemble.GradientBoostingClassifier(n_estimators=500)), ('rfc', ensemble.RandomForestClassifier(n_estimators=200)), ('gpc', gaussian_process.GaussianProcessClassifier()), ('lr', linear_model.LogisticRegressionCV()), ('bnb', naive_bayes.BernoulliNB()), ('gnb', naive_bayes.GaussianNB()), ('knn5', neighbors.KNeighborsClassifier()), ('svc', svm.SVC(probability=True)) ] # In[19]: vote_soft = ensemble.VotingClassifier(estimators=voting_estimates,
pred = decision_stump.predict(data) Pred += pred * Alpha[i] return Pred if __name__ == '__main__': sample = load_iris() data = sample.data target = sample.target data1 = data[target == 0, :] target1 = target[target == 0] target1 = target1 - 1 data2 = data[target == 1, :] target2 = target[target == 1] data = numpy.concatenate((data1, data2), axis=0) target = numpy.concatenate((target1, target2), axis=0) data_train, data_test, target_train, target_test = train_test_split( data, target) clf = AdaBoostClassifier(no_of_stages=20) clf.fit(data_train, target_train) pred = clf.predict(data_test) pred = pred > 0 clf1 = ensemble.AdaBoostClassifier(n_estimators=20, algorithm='SAMME') clf1.fit(data_train, target_train) pred1 = clf1.predict(data_test) pred1 = pred > 0
def adaboost(self, X, y, valid, test):
    """Fit an AdaBoost model on (X, y) and evaluate it on the test frame.

    `valid` is accepted for interface parity with the other model runners
    but is not used in this path.  Returns the metrics dict produced by
    compute_metrics, augmented with the training wall-clock time under
    the 'time' key.
    """
    # Class weights are deliberately not used for the boosting model.
    booster = ensemble.AdaBoostClassifier(n_estimators=500)
    t0 = time.time()
    clf = booster.fit(X, y)
    t1 = time.time()

    # Evaluate on the held-out test set; drop the label and timestamp
    # columns to obtain the feature matrix (computed once, used twice).
    test_features = test.drop("Class", axis=1).drop("Time", axis=1)
    y_score = clf.predict_proba(test_features)[:, 1]
    results = clf.predict(test_features)

    mets = self.compute_metrics(test["Class"], results, y_score)
    mets['time'] = t1 - t0

    for label, key in (('AUROC:', 'auroc'),
                       ('Accuracy:', 'accuracy'),
                       ('Precision:', 'precision'),
                       ('Recall:', 'recall'),
                       ('F Score:', 'f'),
                       ('Average Precision', 'ap')):
        print(label, mets[key])
    print(mets['confusion'], '\n')

    return mets