def search(self):
    """Grid-search AdaBoost hyper-parameters and return the best configuration.

    Every pair drawn from ``self.params['learning_rate']`` and
    ``self.params['n_estimators']`` is fitted on ``self.X_train`` /
    ``self.y_train`` and scored as
    ``self.roi_funcs[1](predictions) + self.roi_funcs[3](predictions)``.
    The best-scoring pair is stored in ``self.best_params_``.

    Returns:
        A new, unfitted ``ABC`` built with the winning parameters.

    Raises:
        ValueError: if no parameter pair was evaluated (``max`` of an
            empty list).
    """
    scored = []
    for rate in self.params['learning_rate']:
        for n_trees in self.params['n_estimators']:
            model = ABC(random_state=7,
                        learning_rate=rate,
                        n_estimators=n_trees)
            model.fit(self.X_train, self.y_train)

            # NOTE(review): X_test is looked up outside ``self`` (unlike
            # X_train / y_train) -- confirm that this is intentional.
            # Samples are predicted one at a time.
            predictions = np.array(
                [model.predict([match])[0] for match in X_test])

            roi = (self.roi_funcs[1](predictions) +
                   self.roi_funcs[3](predictions))
            scored.append((roi, rate, n_trees))

    _, best_rate, best_n_trees = max(scored, key=lambda entry: entry[0])
    self.best_params_ = {
        'learning_rate': best_rate,
        'n_estimators': best_n_trees
    }
    return ABC(random_state=7,
               learning_rate=best_rate,
               n_estimators=best_n_trees)
def get_estimator(algoname, seed=0):
    """Return an unfitted estimator for the short name ``algoname``.

    Valid options are outlined in validate_algos(). Names handled here:
    'nb', 'dt', 'dte', 'lr', 'rfc', 'bac', 'abc' and 'svm'.

    Args:
        algoname: short string identifying the algorithm.
        seed: value forwarded as ``random_state`` to estimators that accept
            one. Estimators that require a random seed (e.g. 'dt') should
            be passed a non-zero seed.

    Returns:
        The estimator instance, or ``None`` (after printing a message) when
        ``algoname`` is not recognised.
    """
    if algoname == 'nb':
        return GaussianNB()
    if algoname == 'dt':
        return tree.DecisionTreeClassifier(random_state=seed)
    if algoname == 'dte':
        return tree.DecisionTreeClassifier(random_state=seed,
                                           criterion="entropy")
    if algoname == 'lr':
        # class_weight='auto' is no longer accepted by scikit-learn;
        # 'balanced' is its documented replacement. The L1 penalty is only
        # supported by some solvers, so the solver is pinned to liblinear
        # (the former implicit default) instead of relying on the default.
        return LogisticRegression(penalty='l1',
                                  class_weight='balanced',
                                  solver='liblinear',
                                  random_state=seed)
    if algoname == 'rfc':
        return RFC(criterion='entropy', random_state=seed)
    if algoname == 'bac':
        return BAC(random_state=seed)
    if algoname == 'abc':
        return ABC(random_state=seed)
    # Implementation of a linear SVM. Note: nonlinear will take more time,
    # but will likely have slightly higher performance.
    if algoname == 'svm':
        return svm.LinearSVC(random_state=seed)

    # You only get here if the string was invalid
    print("Unrecognized algorithm name")
    return None
def NLMmodelexp1():
    """Run ``modelExperiment`` on the NLM data with seven classifiers.

    The classifiers and their display names are passed as two parallel
    lists, together with the 'NLMdata/' directory, the ``fullFV`` component
    list and the file names 'NLMmodelExperiment1.csv' and
    'NLMclassifier_plot1.png' (presumably the outputs -- confirm against
    ``modelExperiment``).
    """
    named_models = [
        ('LogisticRegression', LR()),
        ('DTree', DT()),
        ('KNN', KNC()),
        ('RandomForest', RF()),
        ('AdaBoosted', ABC()),
        ('GaussianNB', GNB()),
        ('QuadraticDiscriminantAnalysis', QDA()),
    ]
    models = [model for _, model in named_models]
    names = [name for name, _ in named_models]
    modelExperiment(nlmInsampleData, nlmOutsampleData, 'NLMdata/', fullFV,
                    models, names, 'NLMmodelExperiment1.csv',
                    'NLMclassifier_plot1.png', True)
def adacv(n_estimators, learning_rate, seed=seed):
    """Return the mean 10-fold cross-validated ROC AUC of an AdaBoost model.

    The arguments are coerced with ``int`` / ``float`` before use, so
    callers may pass float values.

    Args:
        n_estimators: number of boosting rounds (cast to ``int``).
        learning_rate: boosting learning rate (cast to ``float``).
        seed: random state for the classifier (cast to ``int``); defaults
            to the module-level ``seed`` captured at definition time.

    Returns:
        Mean of the per-fold 'roc_auc' scores on the module-level ``X``,
        ``y``.
    """
    model = ABC(n_estimators=int(n_estimators),
                learning_rate=float(learning_rate),
                random_state=int(seed))
    # The metric must be passed by keyword: the fourth positional slot of
    # cross_val_score is not ``scoring`` in sklearn.model_selection, so a
    # positional 'roc_auc' is mis-bound or rejected there.
    return cross_val_score(model, X, y, scoring='roc_auc', cv=10).mean()
def get_models(dataset):
    """Return the ``(estimator, label)`` pairs to evaluate on ``dataset``.

    Args:
        dataset: dataset name; one of 'mnist12', 'mnist28', 'adult',
            'census', 'credit', 'intrusion', 'covtype' or 'news'.

    Returns:
        A list of ``(unfitted estimator, display label)`` tuples.
        Classifiers for every dataset except 'news', which gets regressors.

    Raises:
        AssertionError: if ``dataset`` is not one of the known names.
    """
    if dataset in ("mnist12", "mnist28"):
        return [
            (DTC(max_depth=30, class_weight='balanced'),
             "Decision Tree (max_depth=30)"),
            (LRC(solver='lbfgs', n_jobs=2, multi_class="auto",
                 class_weight='balanced', max_iter=50),
             "Logistic Regression"),
            (MLPC((100, ), max_iter=50), "MLP (100)"),
        ]

    if dataset in ('adult', ):
        return [
            # NOTE(review): the label says max_depth=20 but the tree is
            # built with max_depth=15 -- confirm which one is intended.
            (DTC(max_depth=15, class_weight='balanced'),
             "Decision Tree (max_depth=20)"),
            (ABC(), "Adaboost (estimator=50)"),
            (LRC(solver='lbfgs', n_jobs=2, class_weight='balanced',
                 max_iter=50),
             "Logistic Regression"),
            (MLPC((50, ), max_iter=50), "MLP (50)"),
        ]

    if dataset in ('census', 'credit'):
        return [
            (DTC(max_depth=30, class_weight='balanced'),
             "Decision Tree (max_depth=30)"),
            (ABC(), "Adaboost (estimator=50)"),
            (MLPC((100, ), max_iter=50), "MLP (100)"),
        ]

    if dataset in ('intrusion', 'covtype'):
        return [
            (DTC(max_depth=30, class_weight='balanced'),
             "Decision Tree (max_depth=30)"),
            (MLPC((100, ), max_iter=50), "MLP (100)"),
        ]

    if dataset in ('news', ):
        return [
            (LRR(), "Linear Regression"),
            (MLPR((100, ), max_iter=50), "MLP (100)"),
        ]

    # Raised explicitly (instead of ``assert 0``) so that an unknown dataset
    # is still rejected when Python runs with assertions disabled (-O).
    raise AssertionError("unknown dataset: %r" % (dataset, ))
def SOmodelexp1():
    """Run ``modelExperiment`` on the Stack Overflow data with seven classifiers.

    The classifiers and their display names are passed as two parallel
    lists, together with the 'stackoverflowdata/' directory, the ``fullFV``
    component list and the file names 'SOmodelExperiment1.csv' and
    'SOclassifier_plot1.png' (presumably the outputs -- confirm against
    ``modelExperiment``).
    """
    named_models = [
        ('LogisticRegression', LR()),
        ('DTree', DT()),
        ('KNN', KNC()),
        ('RandomForest', RF(n_estimators=200)),
        ('AdaBoosted', ABC()),
        ('GaussianNB', GNB()),
        ('QuadraticDiscriminantAnalysis', QDA()),
    ]
    models = [model for _, model in named_models]
    names = [name for name, _ in named_models]
    modelExperiment(SOInsampleData, SOOutsampleData, 'stackoverflowdata/',
                    fullFV, models, names, 'SOmodelExperiment1.csv',
                    'SOclassifier_plot1.png', True)
def _build_indicator_matrix(recipes, column_of):
    """Return a DOK matrix with a 1 at [i, column_of[x]] for each x in recipes[i]."""
    matrix = scipy.sparse.dok_matrix((len(recipes), len(column_of)))
    for row, recipe in enumerate(recipes):
        for item in recipe:
            matrix[row, column_of[item]] = 1
    return matrix


def main():
    """Train an AdaBoost cuisine classifier on ``finaldata.json`` and report it.

    Each record's "ingredients" value becomes one recipe and its "cuisine"
    value the matching label. Recipes are encoded as binary
    ingredient-indicator rows, split 80/20, fitted with a 300-estimator
    ``ABC`` and evaluated with ``classification_report`` (printed).
    """
    recipes = []  # list of all the recipes in the dataset
    cuisines = []  # list of all the cuisines in the dataset
    ingredients = set()  # individual ingredients used in the recipes

    with open("finaldata.json") as data_file:
        data = json.load(data_file)

    # NOTE(review): recipes and cuisines stay aligned only if every record
    # carries both keys -- confirm against the data file.
    for record in data:
        if "ingredients" in record:
            recipes.append(record["ingredients"])
            ingredients.update(record["ingredients"])
        if "cuisine" in record:
            cuisines.append(record["cuisine"])

    # One fixed column per ingredient. The lookup table replaces a scan over
    # every known ingredient for every recipe and produces the same matrix.
    column_of = {
        ingredient: column
        for column, ingredient in enumerate(ingredients)
    }

    # splitting the initial dataset into training and testing datasets
    training_data, testing_data, training_cuisine, testing_cuisine = \
        train_test_split(recipes, cuisines, test_size=0.2)

    training_data_matrix = _build_indicator_matrix(training_data, column_of)

    pipeline = ABC(n_estimators=300)  # using 300 classifiers
    # building a boosted classifier from the training set
    pipeline.fit(training_data_matrix, training_cuisine)

    testing_data_matrix = _build_indicator_matrix(testing_data, column_of)

    # predicted cuisine for every held-out recipe
    result = pipeline.predict(testing_data_matrix)
    print(classification_report(testing_cuisine, result))
def singleExperiment(cfg):
    """Train and score RecurrentForest, RandomForest and AdaBoost once.

    Can only run with access to the config variable.

    Args:
        cfg: mapping providing 'n_samples', 'n_features', 'T', 'n_trees',
            'p_connect', 'p_feature', 'p_example', 'tree_kwargs',
            'random_forest_kwargs' and 'ada_boost_kwargs'.

    Returns:
        A (3, 4) array; row ``i`` holds what ``M.binary_metrics`` returns
        for model ``i`` (RecurrentForest, RandomForest, AdaBoost).
    """
    # synthetic data and an 80/20 train/test split
    X, y = make_classification(n_samples=cfg['n_samples'],
                               n_features=cfg['n_features'])
    X_train, X_test, y_train, y_test = TTS(X, y, test_size=.2)

    models = [
        RF.RecurrentForest(X_train, y_train, cfg['T'], cfg['n_trees'],
                           cfg['p_connect'], cfg['p_feature'],
                           cfg['p_example'], cfg['tree_kwargs']),
        RFC(**cfg['random_forest_kwargs']),
        ABC(**cfg['ada_boost_kwargs']),
    ]

    print("<<< training models >>>")
    for model in tqdm(models):
        # RecurrentForest ignores args - data present at init
        model.fit(X_train, y_train)

    print("<<< testing models >>>")
    y_hats = np.zeros((3, X_test.shape[0]))
    for idx, model in tqdm(enumerate(models)):
        # the RecurrentForest (index 0) predicts through predictNew
        predict = model.predictNew if idx == 0 else model.predict
        y_hats[idx, :] = predict(X_test)

    # one row of metrics per model
    measures = np.zeros((3, 4))
    for idx in tqdm(range(3)):
        measures[idx, :] = M.binary_metrics(y_test,
                                            y_hats[idx, :],
                                            model=str(models[idx]))
    return measures
def get_classifier(classifier, df_s, j=0, z=0):
    """Return an unfitted classifier selected by name (case-insensitive).

    Args:
        classifier: one of 'LDA', 'logistic_bal', 'logistic_unbal', 'KNN',
            'ridge_bal', 'ridge_unbal', 'random_forest_bal',
            'random_forest_unbal', 'QDA', 'svc' or 'abc'.
        df_s: not used by this function.
        j: tuning knob whose role depends on the model: ``C = 0.1**j``
            (logistic), ``n_neighbors = j`` (KNN), ``alpha = j`` (ridge),
            ``n_estimators = int(50 * j)`` (random forest / the forest
            inside 'abc'), ``reg_param = j`` (QDA), ``degree = j`` (svc).
            Callers are expected to pass a value valid for the chosen model.
        z: not used by this function.

    Returns:
        The configured, unfitted classifier.

    Raises:
        ValueError: if ``classifier`` is not a recognised name.
    """
    name = classifier.lower()

    if name == 'lda':
        return LDA(solver='lsqr', shrinkage='auto')
    if name == 'logistic_bal':
        # max_iter is an integer count (was the float 1e4)
        return LR(class_weight='balanced', random_state=5, max_iter=10000,
                  C=0.1**j, solver='newton-cg')
    if name == 'logistic_unbal':
        return LR(random_state=5, max_iter=10000, C=0.1**j,
                  solver='newton-cg')
    if name == 'knn':
        return KNNc(n_neighbors=j)
    if name == 'ridge_bal':
        return RdC(alpha=j, class_weight='balanced', random_state=5)
    if name == 'ridge_unbal':
        return RdC(alpha=j, random_state=5)
    if name == 'random_forest_bal':
        return RFC(n_estimators=int(50 * j), random_state=5,
                   min_samples_leaf=2, class_weight='balanced')
    if name == 'random_forest_unbal':
        return RFC(n_estimators=int(50 * j), random_state=5,
                   min_samples_leaf=2)
    if name == 'qda':
        return QDA(reg_param=j)
    if name == 'svc':
        return SVC(gamma='scale', random_state=5, probability=True, degree=j)
    if name == 'abc':
        return ABC(base_estimator=RFC(n_estimators=int(50 * j)),
                   random_state=5)

    # Previously an unknown name fell through to ``return CLSFR`` and failed
    # with an UnboundLocalError that did not name the offending input.
    raise ValueError("unknown classifier name: %r" % (classifier, ))
def AdaBoostpredictor(X_train, y_train, X_test):
    """Fit a default AdaBoost classifier and predict labels for ``X_test``.

    Side effects: stores the accuracy in the module-level ``accModels`` and
    the test predictions in the module-level ``predictions``, both keyed by
    the model's class name.

    Args:
        X_train: training features.
        y_train: training targets.
        X_test: features to predict.

    Returns:
        ``(y_pred, accuracy)``: the predicted labels for ``X_test`` (labels,
        not probabilities) and the accuracy measured on the training data.
    """
    from sklearn.ensemble import AdaBoostClassifier as ABC

    model = ABC(random_state=1)
    model.fit(X_train, y_train)

    # Accuracy is computed on the data the model was fitted on. The former
    # log-loss on hard label predictions was never used and was dropped.
    accuracy = metrics.accuracy_score(y_train, model.predict(X_train))

    y_pred = model.predict(X_test)
    model_name = model.__class__.__name__
    accModels[model_name] = accuracy
    predictions[model_name] = y_pred
    return y_pred, accuracy
def abcScores(self, Xn, y, cv=5, param_name='n_estimators', paramRange=(1, 10, 1), trainW=1, testW=2, title='Adaboost classifier', clfArg=None, plot=False):
    """Run ``validation_curve`` for an Adaboost classifier (ABC) over one parameter.

    Args:
        Xn: feature matrix.
        y: targets.
        cv: cross-validation k-fold setting passed to ``validation_curve``.
        param_name: parameter to sweep; the default is 'n_estimators'.
        paramRange: ``(start, stop, step)`` handed to ``np.arange``, so
            ``stop`` is exclusive.
        trainW: train-score weight forwarded to ``self.scoreModelListDf``.
        testW: test-score weight forwarded to ``self.scoreModelListDf``.
            Per the original description the weighted score is
            ``(test_score*testW + train_score*trainW) / (testW + trainW)``
            (computed outside this method -- confirm there).
        title: model label stored in the result and used as the plot title.
        clfArg: optional dict of extra keyword arguments for the ABC.
        plot: forwarded to ``self.plotTrainTest``; set True to see how the
            best score is collected.

    Returns:
        Whatever ``self.scoreModelListDf`` returns for the single score
        record built here.
    """
    # None instead of a shared mutable ``{}`` default
    clf = ABC(**(clfArg or {}))
    param_range = np.arange(paramRange[0], paramRange[1], paramRange[2])
    train_sc, test_sc = validation_curve(clf, Xn, y,
                                         param_name=param_name,
                                         param_range=param_range,
                                         cv=cv)
    param_score = self.plotTrainTest(train_sc, test_sc, param_range,
                                     t=title, xlabel=param_name, plot=plot)

    score_record = {'model': title, 'param_name': param_name}
    score_record.update(param_score)
    return self.scoreModelListDf([score_record], trainW=trainW, testW=testW)
def make_feature_graph(self, feature_list, labels_filename="trainingSetLabels.dat"):
    '''Plot decision boundaries and a per-feature scatter plot.

    1. Decision boundaries: when every sample has 1 or 2 features, fits
       five classifiers ('Logistic Regression', 'Random Forest',
       'Naive Bayes', 'SVM', 'AdaBoost') and draws one panel per classifier.
    2. Scatter plot: plots one feature of the true and the fake samples to
       visualise how separable they seem. No classifier is involved; for
       manual evaluation only. When samples have more than one feature the
       user is prompted for the feature index.

    Parameters:
        feature_list: a list of lists (or 2-D array) containing the
            features for each sample.
        labels_filename: path to the file holding one integer label per
            line for the training data; non-zero means "true".
    '''
    y = []
    x_true_list = []
    x_fake_list = []
    with open(labels_filename) as label_file:
        for idx, label in enumerate(label_file):
            if int(label):
                y.append(1)
                x_true_list.append(feature_list[idx])
            else:
                y.append(0)
                x_fake_list.append(feature_list[idx])
    y = np.array(y)
    X_plot = feature_list

    #---------------------------- Decision Boundary Plot -----------------------#
    if len(feature_list[0]) in (1, 2):
        print("Now plotting Decision boundary Plot. (Works best for 2 features)")
        classifiers = [
            ('Logistic Regression', LogisticRegression(random_state=1)),
            ('Random Forest', RFC(n_estimators=100, random_state=1)),
            ('Naive Bayes', GNB()),
            ('SVM', SVC()),
            ('AdaBoost', ABC()),
        ]
        # Five classifiers need five panels. The former 2x2 grid offered
        # only four cells, so zip() silently dropped the AdaBoost plot.
        gs = gridspec.GridSpec(3, 2)
        plt.figure(figsize=(10, 12))
        cells = itertools.product(range(3), range(2))
        for (lab, clf), (row, col) in zip(classifiers, cells):
            clf.fit(X_plot, y)
            plt.subplot(gs[row, col])
            plot_decision_regions(X=X_plot, y=y, clf=clf, legend=2)
            plt.title(lab)
        plt.show()

    #---------------------------- Individual Scatter Plot -----------------------#
    plot_idx = 0
    if len(feature_list[0]) != 1:
        plot_idx = int(
            raw_input(
                "Your list has more than 1 feature. Which feature would you like to observe? \n(Insert Index): "
            ))
    print("Now plotting scatter plot of feature:")

    x_true = np.array([feat[plot_idx] for feat in x_true_list])
    x_fake = np.array([feat[plot_idx] for feat in x_fake_list])
    # shared sample-index axis, long enough for the larger class
    y_plot = np.arange(max(len(x_true), len(x_fake)))

    trace_true = go.Scatter(y=x_true, x=y_plot, mode='markers', text="True")
    trace_fake = go.Scatter(y=x_fake, x=y_plot, mode='markers', text="Fake")
    fig = go.Figure(data=[trace_true, trace_fake],
                    layout=go.Layout(showlegend=False))
    offline.plot(fig, filename='text-chart-basic')
def article_classifier(self): train_pos, dev_pos = self.pos_load_features() rare_ttr_perplexity_4gram_features = list( extractFourGram('featureFour.txt', 'basic.csv')) X_dev = list(extractFourGram('featureFour_dev.txt', 'basic_dev.csv')) y_dev = self.get_dev_labels() X = rare_ttr_perplexity_4gram_features y = self.labels X.append(train_pos) X_dev.append(dev_pos) X = np.array(X).T[:, :] X_dev = np.array(X_dev).T[:, :] # self.make_feature_graph(X[:,1:3],"trainingSetLabels.dat") lr_clf = LogisticRegression() lr_clf.fit(X, y) lr_predicted = lr_clf.predict(X_dev) lr_scores = cross_val_score(lr_clf, X, y, cv=5, n_jobs=5) print lr_scores, np.mean(lr_scores), np.std(lr_scores) # svm_predicted = cross_val_predict(lr_clf, X, y, cv=5) print accuracy_score(y_dev, lr_predicted) # SVM Parameters: # {'C': [0.1,1.0,10.0,100.0], 'gamma':[1.0,2.0,'auto',0.1,0.01,0.001], 'kernel':['rbf','linear']} svm_clf = SVC(probability=True) svm_clf.fit(X, y) svm_predicted = svm_clf.predict(X_dev) svm_scores = cross_val_score(svm_clf, X, y, cv=5, n_jobs=5) print svm_scores, np.mean(svm_scores), np.std(svm_scores) # svm_predicted = cross_val_predict(svm_clf, X, y, cv=5) print accuracy_score(y_dev, svm_predicted) # RandomForest Parameters: # {'n_estimators':[10,20,5,30],'criterion':['gini','entropy']} rf_clf = RFC() rf_clf.fit(X, y) rf_predicted = rf_clf.predict(X_dev) rf_scores = cross_val_score(rf_clf, X, y, cv=5, n_jobs=5) print rf_scores, np.mean(rf_scores), np.std(rf_scores) # rf_predicted = cross_val_predict(rf_clf, X, y, cv=5) print accuracy_score(y_dev, rf_predicted) # AdaBoost Parameters: # {'n_estimators':[10,20,5,30],'learning_rate':[1.0,0.1,0.01,0.001,0.05]} ab_clf = ABC() ab_clf.fit(X, y) ab_predicted = ab_clf.predict(X_dev) ab_scores = cross_val_score(ab_clf, X, y, cv=5, n_jobs=5) print ab_scores, np.mean(ab_scores), np.std(ab_scores) # ab_predicted = cross_val_predict(ab_clf, X, y, cv=5) print accuracy_score(y_dev, ab_predicted) # Gaussian NB Parameters: # 
{'n_estimators':[10,20,5,30],'learning_rate':[1.0,0.1,0.01,0.001,0.05]} nb_clf = GNB() nb_clf.fit(X, y) nb_predicted = nb_clf.predict(X_dev) nb_scores = cross_val_score(nb_clf, X, y, cv=5, n_jobs=5) print nb_scores, np.mean(nb_scores), np.std(nb_scores) # nb_predicted = cross_val_predict(nb_clf, X, y, cv=5) print accuracy_score(y_dev, nb_predicted)
if __name__ == "__main__":
    # Synthetic binary classification problem with a fixed seed, split 70/30.
    X, y = make_classification(n_samples=10000,
                               n_features=10,
                               n_informative=5,
                               random_state=0,
                               n_classes=2)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)

    from sklearn.ensemble import AdaBoostClassifier as ABC

    # scikit-learn AdaBoost with the SAMME algorithm: 20 depth-1 trees.
    # The message reads "validation-set score of sklearn's SAMME".
    clf = ABC(DecisionTreeClassifier(max_depth=1),
              n_estimators=20,
              algorithm="SAMME")
    clf.fit(X_train, y_train)
    result = clf.predict(X_test)
    print("sklearn中SAMME的验证集得分为: ", accuracy_score(y_test, result))

    # AdaboostClassifier (defined outside this view) with the same settings;
    # it takes the tree class and receives max_depth through fit().
    # NOTE(review): the message reads "validation-set score using the
    # SAMME.R ensemble" while the model is built with "SAMME" -- confirm
    # which one is intended.
    clf = AdaboostClassifier(DecisionTreeClassifier, 20, "SAMME")
    clf.fit(X_train, y_train, max_depth=1)
    result = clf.predict(X_test)
    print("使用SAMME.R集成的验证集得分为: ", accuracy_score(y_test, result))

    # scikit-learn AdaBoost with the SAMME.R algorithm; this prediction is
    # not printed within the visible part of the script.
    clf = ABC(DecisionTreeClassifier(max_depth=1),
              n_estimators=20,
              algorithm="SAMME.R")
    clf.fit(X_train, y_train)
    result = clf.predict(X_test)
results['acc_train'] = accuracy_score(y_train[:300], predictions_train) results['acc_test'] = accuracy_score(y_test, predictions_test) results['f_train'] = fbeta_score(y_train[:300], predictions_train, 0.5) results['f_test'] = fbeta_score(y_test, predictions_test, 0.5) # Success print "{} trained on {} samples.".format(learner.__class__.__name__, sample_size) # Return the results return results clf_A = ABC(random_state = 42) clf_B = DTC(random_state = 42) clf_C = LinearSVC(random_state = 42) samples_1 = len(X_train)/100 samples_10 = len(X_train)/10 samples_100 = len(X_train) # Collect results on the learners results = {} for clf in [clf_A, clf_B, clf_C]: clf_name = clf.__class__.__name__ results[clf_name] = {} for i, samples in enumerate([samples_1, samples_10, samples_100]):
def main():
    """Define the join/classification experiments and run the NLM one.

    All experiment helpers are closures over ``fullFV``. Only
    ``NLMexperiments()`` is invoked at the end; the other helpers are
    defined but not called from the visible code.
    """
    # Feature-vector components shared by every experiment below.
    fullFV = [
        csAbstract, csSentence, jac, jacq3, dice, diceq3, cosM, cosMq3, LVdist,
        sw, nw, jw
    ]
    # fullModels / fullModelNames are not referenced again in this function.
    fullModels = [LR(), DT(), KNC(), RF(n_estimators=200), ABC(), GNB(), QDA()]
    fullModelNames = [
        'LogisticRegression', 'DTree', 'KNN', 'RandomForest', 'AdaBoosted',
        'GaussianNB', 'QuadraticDiscriminantAnalysis'
    ]

    #modelExperiment(nlmInsampleData,nlmOutsampleData,'NLMdata/',fullFV,[LR(),DT(),KNC(),RF(),ABC(),GNB(),QDA()],
    #                ['LogisticRegression','DTree','KNN','RandomForest','AdaBoosted','GaussianNB','QuadraticDiscriminantAnalysis'],
    #                'NLMmodelExperiment1.csv','NLMclassifier_plot1.png',True)

    def SOmodelexp1():
        """Call modelExperiment on the Stack Overflow data with seven models."""
        modelExperiment(
            SOInsampleData, SOOutsampleData, 'stackoverflowdata/', fullFV,
            [LR(), DT(), KNC(), RF(n_estimators=200), ABC(), GNB(), QDA()], [
                'LogisticRegression', 'DTree', 'KNN', 'RandomForest',
                'AdaBoosted', 'GaussianNB', 'QuadraticDiscriminantAnalysis'
            ], 'SOmodelExperiment1.csv', 'SOclassifier_plot1.png', True)

    def SOmodelexp2():
        """Call modelExperiment on the Stack Overflow data with three models."""
        modelExperiment(
            SOInsampleData, SOOutsampleData, 'stackoverflowdata/', fullFV,
            [LR(), RF(n_estimators=200), ABC()],
            ['LogisticRegression', 'RandomForest', 'AdaBoosted'],
            'SOmodelExperiment2.csv', 'SOclassifier_plot2.png', True)

    def NLMmodelexp1():
        """Call modelExperiment on the NLM data with seven models."""
        modelExperiment(
            nlmInsampleData, nlmOutsampleData, 'NLMdata/', fullFV,
            [LR(), DT(), KNC(), RF(), ABC(), GNB(), QDA()], [
                'LogisticRegression', 'DTree', 'KNN', 'RandomForest',
                'AdaBoosted', 'GaussianNB', 'QuadraticDiscriminantAnalysis'
            ], 'NLMmodelExperiment1.csv', 'NLMclassifier_plot1.png', True)

    #featureVectorExperiment(SOInsampleData,SOOutsampleData,'stackoverflowdata/',[[jacq3],[cosM],[cosMq3],[jacq3,cosM],[cosM,cosMq3],[csAbstract]],DT(),
    #                        'DTree','SOFVExperiment1.csv','SOFV_plot1.png')

    def NLMexperiments():
        """Sweep the sub-sample proportion on the NLM join and save the results."""
        j2 = myJoin.join(nlmInsampleData, nlmOutsampleData, 'NLMdata/')
        j2.setComponentList(fullFV)
        j2.loadCachedInsampleFV()
        results = []
        # proportions from 0.05 up to (but excluding) 0.25 in steps of 0.01
        for prop in np.arange(0.05, 0.25, 0.01):
            precision, recall, _, size = j2.classifyNIterations(
                subSampleProportion=prop)
            results.append([size, precision, recall])
        writeToCSV('NLMdata/sizeTest2.csv', ['Size', 'Precision', 'Recall'],
                   results)

    def SOexperiments():
        """Set up the Stack Overflow join with a 200-tree RF and run thresholdExperiment."""
        j1 = myJoin.join(SOInsampleData, SOOutsampleData, 'stackoverflowdata/')
        j1.setComponentList(fullFV)
        j1.buildInsampleFV()
        j1.model = RF(n_estimators=200)
        j1.modelName = 'RF'

        def threshHoldTest():
            """Write one threshold sweep and the mean of fifty sweeps to CSV."""
            # thresholds from 0.00 to 1.00 in steps of 0.01
            singleThreshTest = j1.thresholdTest(np.arange(0.0, 1.01, 0.01))
            writeToCSV('stackoverflowdata/simpleThresholdTest1.csv',
                       ['Threshold', 'Precision', 'Recall'], singleThreshTest)
            print 'simple thing done'
            fiftyThreshTest = [
                j1.thresholdTest(np.arange(0.0, 1.01, 0.01)) for i in range(50)
            ]
            # element-wise mean across the fifty runs
            mean_values = np.mean(fiftyThreshTest, axis=0)
            writeToCSV('stackoverflowdata/fiftyThresholdTest1.csv',
                       ['Threshold', 'Precision', 'Recall'], mean_values)
            print 'fifty thing done'

        thresholdExperiment(SOInsampleData, SOOutsampleData,
                            'stackoverflowdata/', fullFV,
                            RF(n_estimators=200), 'RF',
                            'thresholdExperiment1.csv')

    #SOexperiments()
    NLMexperiments()
# Each section below fits one classifier on X_train / y_train and prints its
# accuracy on X_test / y_test. ``model``, ``prediction`` and ``score`` are
# rebound by every section.

# Support vector classifier
from sklearn.svm import SVC

model = SVC(random_state=42).fit(X_train, y_train)
prediction = model.predict(X_test)
score = accuracy_score(y_test, prediction)
print(score)

# ADA Boost Classifier
# --------------------

# In[21]:

from sklearn.ensemble import AdaBoostClassifier as ABC

# 100 boosting rounds with a learning rate of 0.80
model = ABC(n_estimators=100, random_state=42,
            learning_rate=.80).fit(X_train, y_train)
prediction = model.predict(X_test)
score = accuracy_score(y_test, prediction)
print(score)

# Bagging Classifier
# ----------------

# In[22]:

from sklearn.ensemble import BaggingClassifier as BC

# 100 bagged estimators
model = BC(n_estimators=100, random_state=42).fit(X_train, y_train)
prediction = model.predict(X_test)
score = accuracy_score(y_test, prediction)
print(score)
val['target'] = val.apply(lambda row: make_target(row), axis=1) val['target_num'] = val['target'].map(target_dict) val['is_EOH'] = val.apply(lambda row: end_of_half_det(row), axis=1) val['pos_leads'] = (val['posteam_score'] > val['defteam_score']).astype(int) to_drop = [ 'Unnamed: 0', 'game_date', 'game_id', 'ends_TD', 'ends_FG', 'ends_punt', 'ends_other', 'target', 'target_num' ] targets = ['ends_TD', 'ends_FG', 'ends_punt', 'ends_other'] features = [c for c in start.columns if c not in to_drop] train, test = tt_split(start) y_train = train['target_num'].values X_train = train[features].values abc = ABC(base_estimator=DTC(max_depth=2), n_estimators=500, learning_rate=0.25) abc.fit(X_train, y_train) y_test = test['target_num'].values X_test = test[features].values score = abc.score(X_test, y_test) print(f'Test: {score:0.3f}') X_val = val[features].values y_val = val['target_num'] val_score = abc.score(X_val, y_val) print(f'AB Validation: {val_score:0.3f}') rf = RFC(n_estimators=500, max_depth=40, bootstrap=False, max_features=5,
delimiter="|", skip_header=1) X = training_data[:, :1000] Y = training_data[:, 1000] # Various Classifiers dtc_min_samples_leaf = DTC(min_samples_leaf=15) etc = ETC() gbc = GBC() rfc = RFC() dtc_max_depth = DTC(max_depth=8) nb = BernoulliNB() svc = SVC() lr = LR() abc = ABC() bc = BC() ''' inv_doc_freq = np.zeros(1000) for i in range(len(inv_doc_freq)): total = sum(X[:, i]) if total == 0: inv_doc_freq[i] = 0 else: inv_doc_freq[i] = math.log(N / sum(X[:, i])) ''' # Data normalization for i in range(len(X)): max_freq = max(X[i]) if max_freq == 0:
def abc(train_examples, train_labels, test_examples, test_labels, verbose):
    """Fit a 500-estimator ``ABC`` and return its score on the test data.

    The score on the training data is printed after the "CONVERGENCE: "
    tag. ``verbose`` is accepted but not used.
    """
    classifier = ABC(n_estimators=500)
    classifier.fit(train_examples, train_labels)

    test_score = classifier.score(test_examples, test_labels)
    train_score = classifier.score(train_examples, train_labels)
    print("CONVERGENCE: ", train_score)
    return test_score
# Scatter of the two classes with bumpiness on x and grade on y. Both
# classes must use the same (bumpy, grade) argument order to match the axis
# labels; the "slow" points previously had the two coordinates swapped.
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(bumpy_slow, grade_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
# plt.show()
################################################################################

# your code here! name your classifier object clf if you want the
# visualization code (prettyPicture) to show you the decision boundary
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.tree import DecisionTreeClassifier as DTC

# candidate classifiers (default settings) and a list for their accuracies
clf_list = [RFC(), ABC(), KNC()]
acc_list = []

# RandomForestClassifier
# n_estimators ~10
# criterion = 'entropy'/ 'gini'
# max_features ~0.4
# Max acc: 0.94

# KNeighborsClassifier
# n_neighbors: 8
# weights: uniform
# algorithm: any
# Max acc: 0.944

# AdaBoostClassifier
# base_estimator:
# Gradient Boosting Classifier gb_cls = GBC(loss='deviance', max_features=None, learning_rate=0.125, n_estimators=150, min_samples_split=2, min_samples_leaf=20, max_depth=5, min_impurity_decrease=0.20, max_leaf_nodes=10, random_state=5) # Isolation Forest if_cls = IFc(random_state=5) if_param = {'n_estimators': [100, 200, 300], 'contamination': [0.05, 0.1, 0.2], 'max_features': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'behaviour': ['new']} #run_func(if_cls, if_param, 7, X, y, 'iso_frst') # Ada Boost Classifier ab_cls = ABC(random_state=5, algorithm='SAMME') ab_param = {'base_estimator': [lr_cls, rd_cls, rf_cls, gb_cls, et_cls], 'n_estimators': [10, 50, 100], 'learning_rate': [0.5, 1.0, 2.0]} # Bagging Classifier bg_cls = BGC(random_state=5) bg_param = {'base_estimator': [lr_cls, kn_cls, ld_cls, rd_cls, rf_cls, gb_cls, et_cls], 'n_estimators': [10, 50, 100], 'max_features': [0.25, 0.5, 0.75, 1.0], 'bootstrap': [True, False], 'bootstrap_features': [True, False]}
import SVM as CLF
import numpy as np
import matplotlib.pyplot as plt
try:
    from sklearn.model_selection import cross_val_score as cvs
except ImportError:
    # Older scikit-learn releases only provide the legacy module, which was
    # removed from later releases.
    from sklearn.cross_validation import cross_val_score as cvs
from sklearn.ensemble import AdaBoostClassifier as ABC

# Cleaned feature table and salary target from the project's SVM module.
df, salary, keys = CLF.clean(CLF.get_data())

# Step 1: mean cross-validation score for each ensemble size.
estimators = [10, 20, 30, 40, 50, 100, 200, 400]
estimator_scores = []
for estimator in estimators:
    clf = ABC(n_estimators=estimator)
    estimator_scores.append(cvs(clf, df, salary).mean())

# Step 2: with the best ensemble size fixed, score each learning rate.
learning_rates = [1, 10, 20, 30, 40, 50, 100, 200]
learning_scores = []
best_estimator = estimators[estimator_scores.index(max(estimator_scores))]
for rate in learning_rates:
    clf = ABC(n_estimators=best_estimator, learning_rate=rate)
    learning_scores.append(cvs(clf, df, salary).mean())

n_estimators = 400
# A learning rate of 1. may not be optimal for both SAMME and SAMME.R
learning_rate = 1.

fig = plt.figure()
ax = fig.add_subplot(111)
def ada_init(n=100):
    """Return a new, unfitted ``ABC`` configured with ``n`` estimators."""
    return ABC(n_estimators=n)
#!/usr/bin/env python
if __name__ == '__main__':
    from sklearn.ensemble import AdaBoostClassifier as ABC
    from sklearn.tree import DecisionTreeClassifier as DTC
    import numpy as np
    from sklearn.metrics import accuracy_score
    from final_utils import read_hwfile

    # initialize data
    dat, lab, nDat = read_hwfile('ml14fall_train_align.dat.hog.dat', 169)

    # The last fifth of the samples is held out for validation. Floor
    # division keeps the sizes integral, and so usable as slice bounds,
    # under both Python 2 and Python 3.
    nVal = nDat // 5
    nTrn = nDat - nVal
    datTrn = dat[:nTrn]
    labTrn = lab[:nTrn]
    datVal = dat[-nVal:]
    labVal = lab[-nVal:]
    print("#trn = {}, #val = {}".format(nTrn, nVal))

    # AdaBoost over depth-6 trees that consider one feature per split
    classfier = ABC(DTC(max_depth=6, max_features=1), n_estimators=50000)
    classfier.fit(datTrn, labTrn)

    # report the validation accuracy after every 10th boosting stage
    for i, labPre in enumerate(classfier.staged_predict(datVal)):
        if i % 10 == 9:
            print(accuracy_score(labPre, labVal))
### stratified shuffle split cross validation. For more info: ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html ''' from sklearn.grid_search import GridSearchCV Using param_grid and GridSearchCV to tune the algorithm's parameters param_grid = { 'n_estimators': [3, 7, 9, 11, 15, 21, 23, 27], } clf = GridSearchCV(ABC(), param_grid) ''' #The parameters are hardcoded because when leaving the GridSearchCV and testing with #tester.py scores are low, but when hardcoded they are high as expected. clf = ABC(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0, n_estimators=23, random_state=None) clf.fit(X_train, y_train) #fitting the data ''' print "Best estimator found by grid search:" print clf.best_estimator_ ''' pred = clf.predict(X_test) acc = accuracy_score(pred, y_test) recall = recall_score(y_test, pred) precision = precision_score(y_test, pred) print 'Accuracy: ', acc print 'Recall: ', recall
if mf == 'None': mf = None mss = get_input('Please input value for min_samples_split:\t') from sklearn.ensemble import RandomForestClassifier as RFC clf = RFC(n_estimators=n, criterion=c, max_features=mf, min_samples_split=mss) parameters = ('n= ' + str(n)) + (', c= ' + c) + (', mf= ' + temp) + ( ', mss= ' + str(mss)) a, b, c, d = run_clf(clf) Classifiers_df.loc[counter] = ['RF', parameters, a, b, c, d] display(Classifiers_df.loc[counter]) counter += 1 elif user_input == 5: print 'You chose AdaBoost Classifier' n = get_input('Please input value for n_estimators:\t') from sklearn.ensemble import AdaBoostClassifier as ABC clf = ABC(n_estimators=n) parameters = ('n= ' + str(n)) a, b, c, d = run_clf(clf) Classifiers_df.loc[counter] = ['AB', parameters, a, b, c, d] display(Classifiers_df.loc[counter]) counter += 1 print_in() user_input = get_input('Please input a number:\n') display(Classifiers_df.sort_values(by=['Accuracy', 'Total'], ascending=[0, 1])) #KNN parameters nb= 3, w= distance, a= ball_tree, ls= 45, 131.224, 0.976109 #RF, n= 3, c= entropy, mf= None, mss= 15, 41.773, 0.98066
def SOmodelexp2():
    """Run ``modelExperiment`` on the Stack Overflow data with three classifiers.

    Logistic regression, a 200-tree random forest and AdaBoost are passed
    with their display names, together with the 'stackoverflowdata/'
    directory, the ``fullFV`` component list and the file names
    'SOmodelExperiment2.csv' and 'SOclassifier_plot2.png' (presumably the
    outputs -- confirm against ``modelExperiment``).
    """
    named_models = [
        ('LogisticRegression', LR()),
        ('RandomForest', RF(n_estimators=200)),
        ('AdaBoosted', ABC()),
    ]
    models = [model for _, model in named_models]
    names = [name for name, _ in named_models]
    modelExperiment(SOInsampleData, SOOutsampleData, 'stackoverflowdata/',
                    fullFV, models, names, 'SOmodelExperiment2.csv',
                    'SOclassifier_plot2.png', True)
def __init__(self, n_estimators, matrix_database):
    """Store the matrix database and create the underlying ``ABC`` model.

    Args:
        n_estimators: forwarded to ``ABC`` as ``n_estimators``.
        matrix_database: kept as ``self._matrix_database``.
    """
    self._matrix_database = matrix_database
    booster = ABC(n_estimators=n_estimators)
    self._abc = booster
    # starts out False; nothing in this method fits the model
    self._has_fit = False
print("Checkpoint I") #create a model instance of naive-bayes naive_instance = nb() print("Checkpoint II") naive_instance.fit(X_train, Y_train) print("Classification Score for Naive-Bayes is -:", naive_instance.score(X_test, Y_test)) print("Checkpoint III") from sklearn.ensemble import AdaBoostClassifier as ABC #create a model instance of AdaBoost adaboost = ABC() adaboost.fit(X_train, Y_train) print("Classification Score for AdaBoost -: ", adaboost.score(X_test, Y_test)) print("Checkpoint IV") from sklearn.ensemble import RandomForestClassifier as rf #create a model instance of RandomForest clf = rf() clf.fit(X_train, Y_train) print("Classification Score for RandomForestClassifier -: ", clf.score(X_test, Y_test)) from sklearn.ensemble import ExtraTreesClassifier as etc #create a model instance of ExtraTreesClassifier extratrees = etc()