def test_svm_model(kernel, training_examples, training_labels, C=1.0, gamma='auto', n_estimators=10):
    model = ensemble.BaggingClassifier(
        svm.SVC(kernel=kernel, C=C, gamma=gamma, random_state=RAND_SEED, probability=True),
        n_estimators=n_estimators, max_samples=0.632)
    model.fit(training_examples, training_labels)
    test_set, test_labels, test_idxs = make_test_set(training_examples, training_labels)
    test_score = model.score(test_set, test_labels)
    get_true_false_positive_negative(model.predict(test_set), test_labels)
    return model, test_score
def ModelSelection(train_data, features, label):
    MLA = [
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
        gaussian_process.GaussianProcessClassifier(),
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
        neighbors.KNeighborsClassifier(),
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]

    MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Score']
    MLA_compare = pd.DataFrame(columns=MLA_columns)

    x_train, x_test, y_train, y_test = train_test_split(
        train_data[features], train_data[label], test_size=0.2)

    row_index = 0
    MLA_predict = pd.DataFrame(index=x_test.index)  # holds each algorithm's test-set predictions
    for alg in MLA:
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
        alg.fit(x_train, y_train)
        MLA_predict[MLA_name] = alg.predict(x_test)
        MLA_compare.loc[row_index, 'MLA Score'] = alg.score(x_test, y_test)
        row_index += 1

    MLA_compare.sort_values(by=['MLA Score'], ascending=False, inplace=True)
    return MLA_compare, x_train, x_test, y_train, y_test
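# A minimal, self-contained usage sketch for ModelSelection above; the synthetic
# DataFrame and its column names are illustrative assumptions, not part of the
# original code, and the module-level sklearn/pandas imports of that snippet are
# assumed to be in place.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo = pd.DataFrame({
    'feat_a': rng.normal(size=200),
    'feat_b': rng.normal(size=200),
    'target': rng.integers(0, 2, size=200),
})
mla_compare, x_train, x_test, y_train, y_test = ModelSelection(
    demo, features=['feat_a', 'feat_b'], label='target')
print(mla_compare.head())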
def __init__(self, df, run_prefix):
    # Code that will prepare the data
    y = df.PHENO
    X = df.drop(columns=['PHENO'])

    # Split the data 70:30
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.3, random_state=42)
    IDs_train = X_train.ID
    IDs_test = X_test.ID
    X_train = X_train.drop(columns=['ID'])
    X_test = X_test.drop(columns=['ID'])

    # Saving the prepped data the other classes will need
    self.df = df
    self.run_prefix = run_prefix
    self.X_train = X_train
    self.X_test = X_test
    self.y_train = y_train
    self.y_test = y_test
    self.IDs_train = IDs_train
    self.IDs_test = IDs_test

    # Where the results will be stored
    self.log_table = None
    self.best_algo = None
    self.algo = None
    self.rfe_df = None

    # The methods we will use
    self.algorithms = [
        linear_model.LogisticRegression(solver='lbfgs'),
        ensemble.RandomForestClassifier(n_estimators=100),
        ensemble.AdaBoostClassifier(),
        ensemble.GradientBoostingClassifier(),
        linear_model.SGDClassifier(loss='modified_huber'),
        svm.SVC(probability=True, gamma='scale'),
        neural_network.MLPClassifier(),
        neighbors.KNeighborsClassifier(),
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
        ensemble.BaggingClassifier(),
        xgboost.XGBClassifier()
    ]
def all_classifiers():
    # Model Data
    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        # Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        # GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),

        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        # Nearest Neighbors
        neighbors.KNeighborsClassifier(),

        # SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),

        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        # Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        XGBClassifier()
    ]
    return MLA
def __init__(self, df, run_prefix, max_iter, cv_count):
    self.run_prefix = run_prefix
    self.max_iter = max_iter
    self.cv_count = cv_count

    self.y_tune = df.PHENO
    self.IDs_tune = df.ID
    self.X_tune = df.drop(columns=['PHENO', 'ID'])

    best_algo_name_in = run_prefix + '.best_algorithm.txt'
    best_algo_df = pd.read_csv(best_algo_name_in, header=None, index_col=False)
    self.best_algo = str(best_algo_df.iloc[0, 0])

    self.algorithms = [
        linear_model.LogisticRegression(),
        ensemble.RandomForestClassifier(),
        ensemble.AdaBoostClassifier(),
        ensemble.GradientBoostingClassifier(),
        linear_model.SGDClassifier(loss='modified_huber'),
        svm.SVC(probability=True),
        neural_network.MLPClassifier(),
        neighbors.KNeighborsClassifier(),
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
        ensemble.BaggingClassifier(),
        xgboost.XGBClassifier()
    ]

    self.log_table = None
    self.best_algo_name_in = None
    self.best_algo_df = None
    self.hyperparameters = None
    self.scoring_metric = None
    self.cv_tuned = None
    self.cv_baseline = None
    self.algo = None
    self.searchCVResults = None
    self.rand_search = None
    self.algo_tuned = None
    self.tune_out = None
def testAllClassifiers(Xfile, yfile):
    X, Xtrain, Xtest, y, ytrain, ytest = loadAndSplitData(Xfile, yfile)
    clfs = [
        linear_model.Perceptron(max_iter=1000),
        neighbors.KNeighborsClassifier(15, weights='uniform'),
        linear_model.LogisticRegression(),
        tree.DecisionTreeClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.AdaBoostClassifier(),
        ensemble.RandomForestClassifier(),
        svm.LinearSVC()
    ]
    clfNames = [
        "perceptron",
        "kNN, k=15",
        "logistic regression",
        "decision tree",
        "bagging",
        "boosting",
        "random forest",
        "support vector machines"
    ]
    for i, clf in enumerate(clfs):
        clf.fit(Xtrain, ytrain)
        print(clfNames[i] + " :", clf.score(Xtest, ytest))
def constructModel(corpus, classList, features, modelOutput):
    """
    Trains a bagging ensemble model on the test corpus.

    Args:
        corpus: A list of lists, containing the GC content, coverage, and class number.
        classList: A list of class names.
        features: List of variables used by each contig.
        modelOutput: Location to save model as GraphViz DOT, or False to save no model.

    Returns:
        classifier: A BaggingClassifier object that has been trained on the test corpus.
    """
    corpus.sort()  # just in case
    X = []
    Y = []
    for item in corpus:
        X.append(item[:-1])  # all but the last item
        Y.append(item[-1])   # only the last item
    X_train, X_test, Y_train, Y_test = mscv.train_test_split(X, Y, test_size=0.3, random_state=0)

    # TODO: implement classifier testing and comparison; for now only BaggingClassifier is used, as per the paper
    #treeClassifier = tree.DecisionTreeClassifier()
    #treeClassifier = treeClassifier.fit(X_train, Y_train)
    #click.echo("Decision tree classifier built, score is %s out of 1.00" % treeClassifier.score(X_test, Y_test))

    baggingClassifier = ensemble.BaggingClassifier()
    baggingClassifier = baggingClassifier.fit(X_train, Y_train)
    click.echo("Bagging classifier built, score is %s out of 1.00" % baggingClassifier.score(X_test, Y_test))

    #forestClassifier = ensemble.RandomForestClassifier(n_estimators=10)
    #forestClassifier = forestClassifier.fit(X_train, Y_train)
    #click.echo("Random forest classifier built, score is %s out of 1.00" % forestClassifier.score(X_test, Y_test))

    #adaClassifier = ensemble.AdaBoostClassifier(n_estimators=100)
    #adaClassifier = adaClassifier.fit(X_train, Y_train)
    #click.echo("AdaBoost classifier built, score is %s out of 1.00" % adaClassifier.score(X_test, Y_test))

    #gradientClassifier = ensemble.GradientBoostingClassifier(n_estimators=100)
    #gradientClassifier = gradientClassifier.fit(X_train, Y_train)
    #click.echo("Gradient tree boosting classifier built, score is %s out of 1.00" % gradientClassifier.score(X_test, Y_test))

    if modelOutput:
        with open(modelOutput, 'w') as dotfile:
            # export_graphviz only accepts a single decision tree, so export the
            # first tree of the fitted bagging ensemble
            tree.export_graphviz(baggingClassifier.estimators_[0], out_file=dotfile,
                                 feature_names=features, class_names=classList,
                                 filled=True, rounded=True, special_characters=True)
    return baggingClassifier
def Cross(X, Y, typ=0, n=5):
    if typ == 0:
        print('Starting SVM')
        classifier = make_pipeline(preprocessing.StandardScaler(), svm.LinearSVC())
    elif typ == 1:
        print('Starting DTree')
        classifier = make_pipeline(preprocessing.StandardScaler(), tree.DecisionTreeClassifier())
    elif typ == 2:
        print('Starting RForest')
        classifier = make_pipeline(preprocessing.StandardScaler(), ensemble.RandomForestClassifier())
    elif typ == 3:
        print('Starting GaussianNB')
        classifier = make_pipeline(preprocessing.StandardScaler(), naive_bayes.GaussianNB())
    elif typ == 4:
        print('Starting AdaBoost')
        classifier = make_pipeline(preprocessing.StandardScaler(), ensemble.AdaBoostClassifier())
    elif typ == 5:
        print('Starting Bagging')
        classifier = make_pipeline(preprocessing.StandardScaler(), ensemble.BaggingClassifier())
    elif typ == 6:
        print('Starting ExTree')
        classifier = make_pipeline(preprocessing.StandardScaler(), ensemble.ExtraTreesClassifier())
    elif typ == 7:
        print('Starting GradBoost')
        classifier = make_pipeline(preprocessing.StandardScaler(), ensemble.GradientBoostingClassifier())
    else:
        return
    scores = cross_val_score(classifier, X, Y, cv=n)
    print(scores)
    print(sum(scores) / n)
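# A minimal usage sketch for Cross above; the iris data is an illustrative
# assumption, not part of the original script, and the surrounding imports
# (make_pipeline, preprocessing, ensemble, cross_val_score) are assumed present.
from sklearn import datasets

X_demo, Y_demo = datasets.load_iris(return_X_y=True)
Cross(X_demo, Y_demo, typ=5, n=5)  # typ=5 selects the Bagging pipeline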
def tryAllClassifers(Xtrain, Xtest, ytrain, ytest):
    # Try with different classifiers
    _, axes = plt.subplots(3, 3, figsize=(14, 14))
    models = [
        neighbors.KNeighborsClassifier(n_neighbors=5),
        linear_model.LogisticRegression(),
        svm.SVC(),
        tree.DecisionTreeClassifier(),
        neural_network.MLPClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.RandomForestClassifier(),
        ensemble.AdaBoostClassifier(),
        ensemble.GradientBoostingClassifier()
    ]
    for ax, model in zip(axes.flatten(), models):
        clf = model.fit(Xtrain, ytrain)
        # clf = linear_model.LogisticRegression().fit(Xtrain, ytrain)
        metrics.plot_confusion_matrix(clf, Xtest, ytest, ax=ax, values_format='d')
        f1 = metrics.f1_score(y_true=ytest, y_pred=clf.predict(Xtest))
        ax.set(title=f'{type(clf).__name__}\nF1={f1:.2f}')
def Bagging(self):
    # model = ensemble.BaggingClassifier(svm.SVC(gamma=0.6, kernel='rbf', C=0.3))
    model = ensemble.BaggingClassifier(
        KNeighborsClassifier(n_neighbors=14), n_estimators=200)
    # 3. Train the model
    model.fit(self.x_train, self.y_train)
    # 4. Predict with the model
    pred_y = model.predict(self.x_test)
    # 5. Evaluate the model
    score = round(metrics.accuracy_score(self.y_test, pred_y), 2)
    # 6. Apply the model
    pred = model.predict(self.pred)[0]
    if pred:
        pred = "-survived-"
    else:
        pred = "-died-"
    # Return the result as a string
    str1 = f"Prediction: {pred}"
    # Return the model evaluation result
    str2 = f"Bagging model accuracy: {score}"
    return str1, str2
if __name__ == "__main__":
    # --------------- Data preparation ---------------
    train = scipy.io.loadmat('./TrainGaborized.mat')
    public = scipy.io.loadmat('./PublicGaborized.mat')
    hidden = scipy.io.loadmat('./HiddenGaborized.mat')
    labeled_images = scipy.io.loadmat('./labeled_images.mat')
    train_labels = labeled_images['tr_labels']
    train_images = train['TrainImages']
    public_images = public['PublicImages']
    hidden_images = hidden['HiddenImages']
    # ------------------------------------------------

    svc = SVC(C=100, cache_size=500, class_weight='auto', coef0=0.0, degree=8,
              gamma=1e-04, kernel='rbf', max_iter=-1, probability=False,
              random_state=None, shrinking=True, tol=0.001, verbose=False)
    engine = ensemble.BaggingClassifier(base_estimator=svc, n_estimators=50)
    engine.fit(train_images.T, train_labels.reshape(-1))
    # save_model(engine)

    public_predictions = engine.predict(public_images.T)
    hidden_predictions = engine.predict(hidden_images.T)
    predictions = public_predictions
    for hidden_pred in hidden_predictions:
        predictions = np.append(predictions, hidden_pred)

    file_name = "solution2"
    create_csv(file_name, predictions)
    create_mat(file_name, predictions)

    # Perform cross validation
    print("Starting cross validation...")
    kfold = cross_validation.KFold(train_labels.shape[0], n_folds=8, shuffle=True)
    scores = cross_validation.cross_val_score(engine, train_images.T,
                                              train_labels.reshape(-1),
                                              n_jobs=-1, cv=kfold)
    print('Cross validation performances: ', scores)
        annot_kws={'fontsize': 12})
    plt.title('Pearson Correlation of Features', y=1.05, size=15)

correlation_heatmap(data1)

# # Step 5: Model Data

# In[*]

# Machine Learning Algorithm (MLA) Selection and initialization
MLA = [
    # Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(n_estimators=100),

    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),

    # GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    # Naive Bayes
def classifyAndTest():
    global currentDataFile
    global trainedModel
    global featuresList
    global predictedLabels

    # Variables for cost-sensitive learning
    has_costs_columns_boolean = False
    costs = []

    # Read in the file as a pandas dataframe
    try:
        numColumns = len(currentDataFile.columns)
        X_input = currentDataFile.iloc[:, 0:(numColumns - 1)]
        y = currentDataFile.iloc[:, (numColumns - 1):numColumns]
        if 'Costs' in X_input.columns:
            has_costs_columns_boolean = True
            costs = X_input['Costs'].to_numpy()
            del X_input['Costs']

        '''Deal with categorical features'''
        categoricals = X_input.select_dtypes(include=['object'])
        if not categoricals.empty:
            ohe_categoricals = pd.get_dummies(X_input.select_dtypes(include=['object']).copy())
        else:
            ohe_categoricals = categoricals
        X = pd.concat([X_input.select_dtypes(exclude=['object']), ohe_categoricals], axis=1)
        featuresList = convert_dataframe_schema(X)
        X = X.to_numpy()
        y = y.to_numpy()
    except:
        modelTrainingResults.insert(tk.END, "ERROR: Unable to process file\n")
        return

    if costSensitiveToggle.get():
        if not has_costs_columns_boolean:
            modelTrainingResults.insert(tk.END, "ERROR: No costs for cost-sensitive learning\n")
            return
        else:
            y_pandas = currentDataFile.iloc[:, (numColumns - 1):numColumns]
            weight_dict = construct_weight_vector_simple(costs, y_pandas, costSensitiveType.get())
            clf = tree.DecisionTreeClassifier(class_weight=weight_dict)
    else:
        models = {
            'SVM': svm.SVC(),
            'Random Forest': ensemble.RandomForestClassifier(),
            'Adaboost': ensemble.AdaBoostClassifier(),
            'Bagging': ensemble.BaggingClassifier(),
            'Gradient Boosting': GradientBoostingClassifier(loss='deviance', max_depth=6, n_estimators=100),
            'Decision Tree': tree.DecisionTreeClassifier()
        }
        clf = models.get(modelChoice.get(), "Invalid choice of Model")

    '''Write results'''
    skf = StratifiedKFold(n_splits=crossVals.get(), shuffle=True)
    stratifiedAccuracy = 0.0
    for train_indices, test_indices in skf.split(X, np.ravel(y)):
        clf_test = clf.fit(X[train_indices], np.ravel(y[train_indices]))
        y_pred = clf_test.predict(X[test_indices])
        stratifiedAccuracy += accuracy_score(y[test_indices], y_pred) * 100

    start = time.time()
    trainedModel = clf.fit(X, np.ravel(y))
    elapsed_time = time.time() - start
    predictedLabels = clf.predict(X)

    modelTrainingResults.insert(tk.END, "Results for " + str(dataFileName.get()) + " using " + str(modelChoice.get()) + "\n")
    modelTrainingResults.insert(tk.END, "Time to build model is " + str(elapsed_time) + " seconds\n")
    modelTrainingResults.insert(tk.END, "Accuracy when trained on all data is " + str(clf.score(X, y) * 100) + "%\n")
    modelTrainingResults.insert(tk.END, "Average accuracy over cross-validated sets is " + str(stratifiedAccuracy / crossVals.get()) + "%\n\n")
def getClassifier(data, target):
    score = 0
    temp = 0

    # Classifier to use in BaggingClassifier
    classifier1 = ensemble.ExtraTreesClassifier(min_samples_split=3, n_estimators=10, max_features=4)
    # Classifier for GridSearch
    classifier = ensemble.BaggingClassifier(classifier1)
    # Params
    param_grid = {'n_estimators': range(5, 25)}
    #param_grid = {'n_estimators': np.linspace(10, 11, num=2)}

    # GridSearch
    grid_search = sklearn.grid_search.GridSearchCV(
        classifier, param_grid,
        scoring=sklearn.metrics.make_scorer(accuracy_score),
        cv=5, n_jobs=4)
    grid_search.fit(data, target)
    clf = grid_search.best_estimator_

    # Print estimator
    print(clf)
    # Print cross-validation scores
    print(cross_val_score(clf, data, target, cv=5, scoring='accuracy'))
    # Print mean of cross-validation scores
    temp = np.mean(cross_val_score(clf, data, target, cv=5, scoring='accuracy'))
    print("Built-in Cross-Validation: {} ".format(temp))

    # Martin's version of cross validation
    chunk_size = len(data) // CVSize  # integer division so the slice indices stay ints
    for x in range(CVSize):
        # These describe where to cut to get our cross-validation data
        first_step = x * chunk_size
        second_step = (x + 1) * chunk_size
        # Get the data parts we train on
        cross_data = np.vstack((data[:first_step], data[second_step:]))
        cross_target = np.append(target[:first_step], target[second_step:])
        # Fit and save the coef
        clf.fit(cross_data, cross_target)
        # Find the held-out score and print it
        sample_data = data[first_step:second_step]
        sample_target = target[first_step:second_step]
        # Get scores for our model
        pred = clf.predict(sample_data)
        RMSE = accuracy_score(sample_target, pred)
        score += RMSE
    score = score / CVSize
    print("Cross-Validation RMSE: {} ".format(score))

    # Get global score
    #clf.fit(data, target)
    #pred = clf.predict(data)
    #RMSE = accuracy_score(target, pred)
    #print("RMSE on whole dataset {}".format(RMSE))

    # Return estimator/classifier
    return clf
# Generate the accuracy metric
metrics.accuracy_score(y_valid, y_pred)
# 0.7425373134328358

#-- AdaBoosted Model --########################################################

# AdaBoost does not support knn

#-- Bagging Model -############################################################

# Initiate the base model
tunebayes = naive_bayes.MultinomialNB(alpha=0, fit_prior=True)

# Initiate the bagging model
bagbayes = ensemble.BaggingClassifier(base_estimator=tunebayes)

# Save the parameter features to tune as a dictionary
params = {
    'n_estimators': [10, 50, 100, 200, 400, 800],
    'max_samples': [1.0, 0.9, 0.8, 0.7, 0.6],
    'max_features': [1.0, 0.8, 0.6, 0.4, 0.2],
    'random_state': [123]
}

# Initiate the tuning procedure, optimising on accuracy
tunebagbayes = model_selection.GridSearchCV(estimator=bagbayes, param_grid=params, scoring='accuracy')

# tune the model
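# A hedged sketch of the tuning step the comment above leads into; the
# training-set names X_train and y_train are assumptions not shown in this snippet.
tunebagbayes.fit(X_train, y_train)
print(tunebagbayes.best_params_)
print(tunebagbayes.best_score_)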
    },
    'model': ensemble.AdaBoostClassifier()
}

bagging = {
    'features': {
        'base_estimator': [tree.DecisionTreeClassifier(max_depth=8, random_state=random_seed)],
        'max_samples': [1, 0.75, 0.5, 0.25],
        'max_features': [1, 0.75, 0.5],
        'n_estimators': [20],
        'bootstrap': [True, False],
        'bootstrap_features': [True, False],
        'random_state': [random_seed]
    },
    'model': ensemble.BaggingClassifier()
}

logit = {
    'features': {
        'C': [0.001, 0.01, 0.1, 1, 100, 1000, 10000],
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
        'class_weight': ['auto'],
        'random_state': [random_seed]
    },
    'model': linear_model.LogisticRegression()
}

knn = {
    'features': {
        'n_neighbors': list(range(5, 25, 5)),
svm.SVC(kernel="rbf", C=1, probability=True), svm.SVC(kernel="rbf", C=0.1, probability=True), svm.SVC(kernel="rbf", C=0.025, probability=True), tree.DecisionTreeClassifier(), ensemble.RandomForestClassifier(n_estimators=200), # chosen ensemble.AdaBoostClassifier(n_estimators=100), ensemble.AdaBoostClassifier(n_estimators=100, algorithm='SAMME.R'), ensemble.AdaBoostClassifier(n_estimators=100, algorithm='SAMME.R', learning_rate=1.2), ensemble.AdaBoostClassifier(n_estimators=200), ensemble.AdaBoostClassifier(n_estimators=200, algorithm='SAMME.R'), ensemble.AdaBoostClassifier(n_estimators=200, algorithm='SAMME.R', learning_rate=1.2), ensemble.BaggingClassifier(n_estimators=10), ensemble.BaggingClassifier(n_estimators=10, bootstrap=False), ensemble.BaggingClassifier(n_estimators=20), ensemble.BaggingClassifier(n_estimators=20, bootstrap=False), ensemble.BaggingClassifier(n_estimators=50), ensemble.BaggingClassifier(n_estimators=50, bootstrap=False), ensemble.BaggingClassifier(n_estimators=100), ensemble.BaggingClassifier(n_estimators=100, bootstrap=False), naive_bayes.GaussianNB(), naive_bayes.GaussianNB(priors=None), neural_network.MLPClassifier() ] def print_full(x): pd.set_option('display.max_rows', len(x))
pass_ratio = rawstat.iloc[:, 1] / rawstat.iloc[:, 2]
shot_ratio = rawstat.iloc[:, 3] / rawstat.iloc[:, 4]
ratio.append(pass_ratio)
ratio.append(shot_ratio)
ratio = np.array(ratio)
ratio = ratio.astype('float')

x_min, x_max = ratio[0].min() - 0.05, ratio[0].max() + 0.05
y_min, y_max = ratio[1].min() - 0.05, ratio[1].max() + 0.05
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))

boost_tree = ensemble.AdaBoostClassifier(
    tree.DecisionTreeClassifier(max_depth=3)).fit(ratio.T, category)
bag_tree = ensemble.BaggingClassifier(
    tree.DecisionTreeClassifier(max_depth=3)).fit(ratio.T, category)

plt.figure(1)
fig, axarr = plt.subplots(1, 2)
for i in [0, 1]:
    decision_tree = tree.DecisionTreeClassifier(max_depth=i + 4).fit(ratio.T, category)
    tree_result = decision_tree.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    axarr[i].pcolormesh(xx, yy, tree_result, cmap=plt.cm.Paired)
    axarr[i].scatter(pass_ratio[category == 0], shot_ratio[category == 0], c='r', marker='o')
    axarr[i].scatter(pass_ratio[category == 1],
def get_skl_estimator(self, **default_parameters):
    return ensemble.BaggingClassifier(**default_parameters)
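# A hedged, self-contained sketch of the same pattern as get_skl_estimator above:
# forwarding keyword arguments straight into sklearn's BaggingClassifier (the toy
# iris data is an illustrative assumption, not part of the original wrapper class).
from sklearn import datasets, ensemble

X_demo, y_demo = datasets.load_iris(return_X_y=True)
bag = ensemble.BaggingClassifier(n_estimators=25, max_samples=0.8, random_state=0)
print(bag.fit(X_demo, y_demo).score(X_demo, y_demo))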
" and num_trees = " + str(tup[1])) h_ens = boosting(Xtrn, ytrn, max_depth=tup[0], num_stumps=tup[1]) y_pred = [predict_ensemble_example(x, h_ens) for x in Xtst] confusion_matrix(y_pred, ytst) print() ######### Problem c - Scikit-learn #### Bagging l = [(3, 10), (3, 20), (5, 10), (5, 20)] print("-------Problem c - Scikit-learn bagging-------") for tup in l: print("Scikit-learn bagging with max_depth = " + str(tup[0]) + " and num_trees = " + str(tup[1])) cart = tree.DecisionTreeClassifier(max_depth=tup[0]) num_trees = tup[1] model = ensemble.BaggingClassifier(base_estimator=cart, n_estimators=num_trees) clf = model.fit(Xtrn, ytrn) y_pred = clf.predict(Xtst) confusion_matrix(y_pred, ytst) print() #### Boosting l = [(1, 20), (1, 40), (2, 20), (2, 40)] print("-------Problem c - Scikit-learn AdaBoost-------") for tup in l: print("Scikit-learn AdaBoost with max_depth = " + str(tup[0]) + " and num_stumps = " + str(tup[1])) cart = tree.DecisionTreeClassifier(max_depth=tup[0]) num_trees = tup[1] model = ensemble.AdaBoostClassifier(base_estimator=cart, n_estimators=num_trees)
train_Acc = []
test_Acc = []

## Random Forest Classifier
clf = ensemble.RandomForestClassifier(n_estimators=numBaseClassifiers)
clf.fit(X_train, Y_train)
Y_predict_train_EM = clf.predict(X_train)
Y_predict_test_EM = clf.predict(X_test)
train_Acc.append(accuracy_score(Y_train, Y_predict_train_EM))
test_Acc.append(accuracy_score(Y_test, Y_predict_test_EM))
print("Ensemble Method by Random Forest Classifier gives us train accuracy: %f and test accuracy: %f "
      % (accuracy_score(Y_train, Y_predict_train_EM), accuracy_score(Y_test, Y_predict_test_EM)))

## Bagging Classifier
clf = ensemble.BaggingClassifier(
    DecisionTreeClassifier(max_depth=max_depth_EM),
    n_estimators=numBaseClassifiers)
clf.fit(X_train, Y_train)
Y_predict_train_EM = clf.predict(X_train)
Y_predict_test_EM = clf.predict(X_test)
train_Acc.append(accuracy_score(Y_train, Y_predict_train_EM))
test_Acc.append(accuracy_score(Y_test, Y_predict_test_EM))
print("Ensemble Method by Bagging Classifier gives us train accuracy: %f and test accuracy: %f "
      % (accuracy_score(Y_train, Y_predict_train_EM), accuracy_score(Y_test, Y_predict_test_EM)))

## Adaboost Classifier
clf = ensemble.AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=max_depth_EM),
    n_estimators=numBaseClassifiers)
clf.fit(X_train, Y_train)
    ['PassengerId', 'Name', 'Age', 'Ticket', 'Cabin', 'Survived'],
    axis=1, inplace=False)
titanic2.shape

# Extract only train records 0:891
X_train = titanic2[0:titanic_train.shape[0]]
X_train.shape
X_train.info()
y_train = titanic_train['Survived']

# oob score is computed as part of the model construction process
dt_estimator = tree.DecisionTreeClassifier()
# This is what the real Bagging model is.
# base_estimator specifies which model to use; in this case we build it with a Decision Tree classifier
bt_estimator = ensemble.BaggingClassifier(base_estimator=dt_estimator, random_state=2017)
# n_estimators means how many trees to grow
# base_estimator__ (the double underscore __ acts as a prefix for the base estimator's own parameters)
bt_grid = {'n_estimators': [5, 6], 'base_estimator__max_depth': [3, 4, 5]}
grid_bt_estimator = model_selection.GridSearchCV(bt_estimator, bt_grid, cv=10, n_jobs=5)
grid_bt_estimator.fit(X_train, y_train)
print(grid_bt_estimator.grid_scores_)  # In scikit-learn version 0.18
print(grid_bt_estimator.best_score_)
print(grid_bt_estimator.best_params_)
print(grid_bt_estimator.score(X_train, y_train))
count_vectorizer = feature_extraction.text.CountVectorizer()
train_vectors = count_vectorizer.fit_transform(train["text"])
print(train_vectors)
test_vectors = count_vectorizer.transform(test["text"])

"""
clf = linear_model.RidgeClassifier()
scores = model_selection.cross_val_score(clf, train_vectors, train["target"], cv=3, scoring="f1")
print(scores)
"""

Methodes = [
    # Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # Gaussian Processes
    #gaussian_process.GaussianProcessClassifier(),

    # GLM
    linear_model.LogisticRegressionCV(),
    linear_model.LogisticRegression(C=1000, random_state=0, solver='liblinear'),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
import numpy as np
from sklearn import preprocessing, neighbors, ensemble, decomposition, model_selection
import pandas as pd
import pickle

f = open('knn.pickle', 'wb')

df = pd.read_csv('voice.csv')
df.replace('?', -99999, inplace=True)

X = np.array(df[['meanfun', 'Q25', 'sd', 'IQR', 'sfm', 'meanfreq', 'mode']])
y = np.array(df['label'])

'''pca = decomposition.PCA()
X = pca.fit_transform(X)'''

gender_encoder = preprocessing.LabelEncoder()
y = gender_encoder.fit_transform(y)

scaler = preprocessing.StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=5)

clf = ensemble.BaggingClassifier(neighbors.KNeighborsClassifier(), max_features=7)
clf.fit(X_train, y_train)

print(clf.score(X_test, y_test))
pickle.dump(clf, f)
f.close()
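# A short follow-up sketch: reloading the bagging model pickled above and
# re-checking its held-out accuracy. The file name matches the snippet; reusing
# X_test/y_test from the same session is an assumption for illustration.
with open('knn.pickle', 'rb') as f_in:
    restored_clf = pickle.load(f_in)
print(restored_clf.score(X_test, y_test))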
from sklearn import neighbors
from sklearn.grid_search import ParameterGrid
from datetime import timedelta
import matplotlib.pyplot as plt
from scipy import optimize

# Dictionary of models and parameters - inspiration from
# https://github.com/rayidghani/magicloops/blob/master/simpleloop.py
MODELS = {
    'decision_tree': tree.DecisionTreeClassifier(),
    'logistic_regression': linear_model.LogisticRegression(),
    'knn': neighbors.KNeighborsClassifier(),
    'random_forest': ensemble.RandomForestClassifier(),
    'support_vector_machine': svm.SVC(),
    'boosting': ensemble.AdaBoostClassifier(),
    'bagging': ensemble.BaggingClassifier()
}

PARAMS = {
    'decision_tree': {
        'max_depth': [1, 3, 5, 8, 20]
    },
    'logistic_regression': {
        'C': [0.001, 0.01, 0.1, 1, 10]
    },
    'knn': {
        'n_neighbors': [5, 10, 25]
    },
    'random_forest': {
        'n_estimators': [1, 2, 3, 4, 10]
    },
""" 2. Bagging · 显然,Boosting策略受到数据集的影响较大, 为了获得更好的泛化能力,有必要使个体分类器尽可能独立 · Bagging策略: Step 1:对给定数据集进行采样,获得N个子数据集(可以有交集) Step 2:用N个子数据集训练N个个体分类器 Step 3:将所有个体分类器并行结合,对分类任务采用投票法预测 """ ''' 2.1 BaggingClassifier (实现基本的Bagging策略) ''' base = tree.DecisionTreeClassifier() model = ensemble.BaggingClassifier(base_estimator=base, n_estimators=10, random_state=1) kfold = model_selection.KFold(n_splits=20,random_state=1) result = model_selection.cross_val_score(model,X,y,cv=kfold) print(f'Accuracy of Bagging: {result.mean()*100:.2f}%') ''' 2.2 RandomForestClassifier 随机森林 (RF) 思想:在Bagging策略的基础上,构建决策树时,随机选择特征作为节点 ''' model = ensemble.RandomForestClassifier(n_estimators=30, random_state=1) kfold = model_selection.KFold(n_splits=10,random_state=1) result = model_selection.cross_val_score(model,X,y,cv=kfold) print(f'Accuracy of RF: {result.mean()*100:.2f}%')
test_accuracy = clf.score(test_array, test_label)
print("--- Decision Tree Classifier ---")
print("tree_depth", tree_depth)
print("train accuracy:", train_accuracy)
#print("validate accuracy:", validate_accuracy)
print("test accuracy:", test_accuracy)
print("")

# Bagged Decision Tree
from sklearn import ensemble
tree_depth = 3
est = 15
clf = ensemble.BaggingClassifier(
    tree.DecisionTreeClassifier(max_depth=tree_depth),
    max_samples=1.0, max_features=1.0, n_estimators=est)
clf = clf.fit(train_array, train_label)
train_accuracy = clf.score(train_array, train_label)
#validate_accuracy = clf.score(validate_array, validate_label)
test_accuracy = clf.score(test_array, test_label)
print("--- Bagging Tree Classifier ---")
print("n_estimators:", est)
print("train accuracy:", train_accuracy)
#print("validate accuracy:", validate_accuracy)
print("test accuracy:", test_accuracy)
print("")
# coding=utf-8
"""Comparison of various classifiers acting alone and inside a bagging ensemble."""
from sklearn import datasets, model_selection, metrics, tree, ensemble

if __name__ == "__main__":
    print("Loading data...")
    X, y = datasets.load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)

    print("Fitting classifiers...")
    t = tree.DecisionTreeClassifier()
    t.fit(X_train, y_train)
    e = ensemble.BaggingClassifier(tree.DecisionTreeClassifier(), n_estimators=35,
                                   max_features=0.5, max_samples=0.5)
    e.fit(X_train, y_train)

    print("Evaluating classifiers...")
    print("#" * 128)
    print("Decision tree:")
    print("Test:")
    print(metrics.classification_report(y_test, t.predict(X_test)))
    print(metrics.confusion_matrix(y_test, t.predict(X_test)))
    print("Training:")
    print(metrics.classification_report(y_train, t.predict(X_train)))
    print(metrics.confusion_matrix(y_train, t.predict(X_train)))
    print("#" * 128)
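    # A hedged continuation mirroring the decision-tree report above, but for the
    # bagging ensemble `e`; the original snippet is cut off after the separator
    # line, so this block is a sketch of what would follow, not the original code.
    print("Bagging ensemble:")
    print("Test:")
    print(metrics.classification_report(y_test, e.predict(X_test)))
    print(metrics.confusion_matrix(y_test, e.predict(X_test)))
    print("Training:")
    print(metrics.classification_report(y_train, e.predict(X_train)))
    print(metrics.confusion_matrix(y_train, e.predict(X_train)))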
stkf = sms.StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
C_space = np.logspace(-3, 2, 6)
for c in tqdm.tqdm(C_space):
    lr = slm.LogisticRegression(C=c, random_state=1)
    print(c, sms.cross_val_score(lr, R_train_2, y_train_2, scoring='accuracy', cv=stkf).mean())

# I do not have high hopes for the logit, so let's tune it right away

### Logistic regression
lr = slm.LogisticRegression(C=0.1, random_state=1)
bg_lr = se.BaggingClassifier(base_estimator=lr, n_estimators=100, random_state=1, n_jobs=1)
params = {
    'max_features': [3, 6, 12, 24, 48, 96, 192, 384],
    'max_samples': [0.5, 0.75, 0.9]
}
rs_lr = sms.RandomizedSearchCV(estimator=bg_lr, n_jobs=2, cv=stkf, verbose=2,
                               param_distributions=params, scoring='accuracy',
                               n_iter=20, random_state=1)
rs_lr.fit(R_train_2, y_train_2)
# Predict probability on test data
y_pred_proba_ada = ada.predict_proba(X_test)

# Accuracy metrics (log-loss)
logloss = metrics.log_loss(y_test, y_pred_proba_ada)
print('Log-loss: {:.6f}'.format(logloss))

# ### Bagging classifier

# In[ ]:

# Bagging classifier
# The base estimator is a decision tree if not stated otherwise
bagg = ensemble.BaggingClassifier(
    base_estimator=ensemble.ExtraTreesClassifier(n_estimators=50, criterion='entropy', max_depth=5),
    n_estimators=100, max_samples=0.6, max_features=0.8, oob_score=True, n_jobs=-1)

# Fit
bagg.fit(X_train, y_train)

# In[ ]:

# Predict probability on test data
y_pred_proba_bagg = bagg.predict_proba(X_test)

# Accuracy metrics (log-loss)
logloss = metrics.log_loss(y_test, y_pred_proba_bagg)
print('Log-loss: {:.6f}'.format(logloss))

# ### Stochastic Gradient Descent