def test_staged_predict():
    """Check that staged outputs finish at the final output.

    For every fitted booster each staged generator must yield 10 items
    (one per requested estimator) and its last item must match the
    corresponding non-staged call.
    """
    # AdaBoost classification, once per boosting algorithm
    for algorithm in ('SAMME', 'SAMME.R'):
        booster = AdaBoostClassifier(algorithm=algorithm, n_estimators=10)
        booster.fit(iris.data, iris.target)

        final_labels = booster.predict(iris.data)
        label_stages = list(booster.staged_predict(iris.data))
        final_proba = booster.predict_proba(iris.data)
        proba_stages = list(booster.staged_predict_proba(iris.data))
        final_score = booster.score(iris.data, iris.target)
        score_stages = list(booster.staged_score(iris.data, iris.target))

        assert_equal(len(label_stages), 10)
        assert_array_almost_equal(final_labels, label_stages[-1])
        assert_equal(len(proba_stages), 10)
        assert_array_almost_equal(final_proba, proba_stages[-1])
        assert_equal(len(score_stages), 10)
        assert_array_almost_equal(final_score, score_stages[-1])

    # AdaBoost regression
    regressor = AdaBoostRegressor(n_estimators=10, random_state=0)
    regressor.fit(boston.data, boston.target)

    final_values = regressor.predict(boston.data)
    value_stages = list(regressor.staged_predict(boston.data))
    final_score = regressor.score(boston.data, boston.target)
    score_stages = list(regressor.staged_score(boston.data, boston.target))

    assert_equal(len(value_stages), 10)
    assert_array_almost_equal(final_values, value_stages[-1])
    assert_equal(len(score_stages), 10)
    assert_array_almost_equal(final_score, score_stages[-1])
def initialize(tree_depth=2, sample_size=300, sample_noise=0.2):
    """Fit AdaBoost on a two-moons sample and collect per-stage plot data.

    The number of boosting rounds is read from the module-level name
    ``n_iterations``.

    Parameters
    ----------
    tree_depth : max_depth of the DecisionTreeClassifier base estimator.
    sample_size : n_samples passed to make_moons.
    sample_noise : noise passed to make_moons.

    Returns
    -------
    dict with keys 'X', 'y', 'staged_sample_weights', 'xs', 'ys',
    'staged_zz' and 'single_zz'.
    """
    # create dataset
    X, y = make_moons(n_samples=sample_size, noise=sample_noise)

    # fit classifier
    booster = AdaBoostClassifier(
        n_estimators=n_iterations,
        base_estimator=DecisionTreeClassifier(max_depth=tree_depth))
    booster.fit(X, y)

    # Rebuild the sample weights stage by stage. Row 0 is the uniform
    # starting distribution; row k is the distribution after stage k.
    # NOTE(review): `wrong` below comes from the *ensemble's* staged
    # prediction, not from the individual stage estimator -- confirm that
    # this is the intended reconstruction.
    n_samples = len(X)
    stage_predictions = np.array(list(booster.staged_predict(X)))
    stage_is_wrong = stage_predictions != y
    staged_sample_weights = np.ones(shape=(n_iterations + 1, n_samples)) / n_samples
    for prev in range(n_iterations):
        alpha = booster.estimator_weights_[prev]
        weights = staged_sample_weights[prev].copy()
        wrong = stage_is_wrong[prev]
        # Update rule adapted from the sklearn AdaBoostClassifier source:
        # only boost positive weights.
        boostable = (weights > 0) | (alpha < 0)
        weights *= np.exp(alpha * wrong * boostable)
        weights /= np.sum(weights)
        staged_sample_weights[prev + 1] = weights

    # Grid for the decision boundary: data extent padded by 10% per side.
    step = .1
    x_lo, x_hi = np.min(X[:, 0]), np.max(X[:, 0])
    y_lo, y_hi = np.min(X[:, 1]), np.max(X[:, 1])
    x_span = x_hi - x_lo
    y_span = y_hi - y_lo
    xs = np.arange(x_lo - x_span * 0.1, x_hi + x_span * 0.1, step)
    ys = np.arange(y_lo - y_span * 0.1, y_hi + y_span * 0.1, step)
    xx, yy = np.meshgrid(xs, ys)
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    # Ensemble prediction on the grid after each stage.
    staged_zz = np.array(list(booster.staged_predict(grid_points)))
    staged_zz = staged_zz.reshape((len(staged_zz),) + (xx.shape[0], xx.shape[1]))

    # Prediction of each individual estimator on the grid.
    # NOTE(review): the loop starts at 1, so single_zz[0] stays all zeros --
    # confirm whether estimator 0 is skipped on purpose.
    estimators = booster.estimators_
    single_zz = np.zeros(shape=(len(estimators), xx.shape[0], xx.shape[1]))
    for idx in range(1, n_iterations):
        single_zz[idx] = estimators[idx].predict(grid_points).reshape(xx.shape)

    return {
        'X': X,
        'y': y,
        'staged_sample_weights': staged_sample_weights,
        'xs': xs,
        'ys': ys,
        'staged_zz': staged_zz,
        'single_zz': single_zz,
    }
def test_staged_predict():
    """Verify staged predictions, probabilities and scores.

    Each staged generator must produce 10 items and end on the value
    returned by the matching non-staged method.
    """
    # AdaBoost classification
    for alg in ['SAMME', 'SAMME.R']:
        model = AdaBoostClassifier(algorithm=alg, n_estimators=10)
        model.fit(iris.data, iris.target)
        # (final output, list of staged outputs) pairs, in evaluation order
        pairs = [
            (model.predict(iris.data),
             list(model.staged_predict(iris.data))),
            (model.predict_proba(iris.data),
             list(model.staged_predict_proba(iris.data))),
            (model.score(iris.data, iris.target),
             list(model.staged_score(iris.data, iris.target))),
        ]
        for final, stages in pairs:
            assert_equal(len(stages), 10)
            assert_array_almost_equal(final, stages[-1])

    # AdaBoost regression
    model = AdaBoostRegressor(n_estimators=10)
    model.fit(boston.data, boston.target)
    pairs = [
        (model.predict(boston.data),
         list(model.staged_predict(boston.data))),
        (model.score(boston.data, boston.target),
         list(model.staged_score(boston.data, boston.target))),
    ]
    for final, stages in pairs:
        assert_equal(len(stages), 10)
        assert_array_almost_equal(final, stages[-1])
def some(X, Y, X_test, Y_test):
    """Fit a default AdaBoostClassifier and print its test error.

    Prints the training time, the flattened test labels, the final
    predictions, the final test error and the list of per-stage test errors.

    Parameters
    ----------
    X, Y : training samples and labels, passed straight to ``fit``.
    X_test : test samples.
    Y_test : iterable of indexable rows; the label is element 0 of each row.

    Returns
    -------
    None
    """
    ada = AdaBoostClassifier()
    print("Train Model ---")
    t1 = time()
    ada.fit(X, Y)
    t2 = time()
    print("Model Trained ----------", t2 - t1)

    # Flatten the labels: keep the first element of every row.
    Y_test2 = [k[0] for k in Y_test]
    print("Testing: ")
    print(Y_test2)
    pred = ada.predict(X_test)
    print(pred)
    # Despite the name, this is the error rate (1 - accuracy).
    accu = 1. - accuracy_score(y_true=Y_test2, y_pred=pred)
    print(accu)

    print("STAGED _____________")
    test_errors = [
        1. - accuracy_score(y_true=Y_test2, y_pred=test_predict)
        for test_predict in ada.staged_predict(X_test)
    ]
    print("errorss : ")
    print(test_errors)
def test_staged_predict(algorithm):
    # Staged outputs must contain one item per estimator and end on the
    # final (non-staged) output; sample weights are used for fit and score.
    rng = np.random.RandomState(0)
    iris_weights = rng.randint(10, size=iris.target.shape)
    diabetes_weights = rng.randint(10, size=diabetes.target.shape)

    def check_last_stage(final, stages):
        assert len(stages) == 10
        assert_array_almost_equal(final, stages[-1])

    # AdaBoost classification
    classifier = AdaBoostClassifier(algorithm=algorithm, n_estimators=10)
    classifier.fit(iris.data, iris.target, sample_weight=iris_weights)

    final_labels = classifier.predict(iris.data)
    label_stages = list(classifier.staged_predict(iris.data))
    final_proba = classifier.predict_proba(iris.data)
    proba_stages = list(classifier.staged_predict_proba(iris.data))
    final_score = classifier.score(
        iris.data, iris.target, sample_weight=iris_weights)
    score_stages = list(classifier.staged_score(
        iris.data, iris.target, sample_weight=iris_weights))

    check_last_stage(final_labels, label_stages)
    check_last_stage(final_proba, proba_stages)
    check_last_stage(final_score, score_stages)

    # AdaBoost regression
    regressor = AdaBoostRegressor(n_estimators=10, random_state=0)
    regressor.fit(diabetes.data, diabetes.target,
                  sample_weight=diabetes_weights)

    final_values = regressor.predict(diabetes.data)
    value_stages = list(regressor.staged_predict(diabetes.data))
    final_score = regressor.score(
        diabetes.data, diabetes.target, sample_weight=diabetes_weights)
    score_stages = list(regressor.staged_score(
        diabetes.data, diabetes.target, sample_weight=diabetes_weights))

    check_last_stage(final_values, value_stages)
    check_last_stage(final_score, score_stages)
def test_staged_predict():
    # Staged predictions: 10 stages each, last stage equal to the final
    # output, with random integer sample weights for fit and score.
    rng = np.random.RandomState(0)
    iris_weights = rng.randint(10, size=iris.target.shape)
    boston_weights = rng.randint(10, size=boston.target.shape)

    # AdaBoost classification
    for algorithm in ('SAMME', 'SAMME.R'):
        booster = AdaBoostClassifier(algorithm=algorithm, n_estimators=10)
        booster.fit(iris.data, iris.target, sample_weight=iris_weights)

        final_labels = booster.predict(iris.data)
        label_stages = list(booster.staged_predict(iris.data))
        final_proba = booster.predict_proba(iris.data)
        proba_stages = list(booster.staged_predict_proba(iris.data))
        final_score = booster.score(
            iris.data, iris.target, sample_weight=iris_weights)
        score_stages = list(booster.staged_score(
            iris.data, iris.target, sample_weight=iris_weights))

        assert_equal(len(label_stages), 10)
        assert_array_almost_equal(final_labels, label_stages[-1])
        assert_equal(len(proba_stages), 10)
        assert_array_almost_equal(final_proba, proba_stages[-1])
        assert_equal(len(score_stages), 10)
        assert_array_almost_equal(final_score, score_stages[-1])

    # AdaBoost regression
    regressor = AdaBoostRegressor(n_estimators=10, random_state=0)
    regressor.fit(boston.data, boston.target, sample_weight=boston_weights)

    final_values = regressor.predict(boston.data)
    value_stages = list(regressor.staged_predict(boston.data))
    final_score = regressor.score(
        boston.data, boston.target, sample_weight=boston_weights)
    score_stages = list(regressor.staged_score(
        boston.data, boston.target, sample_weight=boston_weights))

    assert_equal(len(value_stages), 10)
    assert_array_almost_equal(final_values, value_stages[-1])
    assert_equal(len(score_stages), 10)
    assert_array_almost_equal(final_score, score_stages[-1])
def test_staged_predict():
    # Each staged generator must yield 10 items and finish on the value of
    # the matching non-staged method (weighted fit and weighted scoring).
    rng = np.random.RandomState(0)
    iris_weights = rng.randint(10, size=iris.target.shape)
    boston_weights = rng.randint(10, size=boston.target.shape)

    # AdaBoost classification
    for alg in ['SAMME', 'SAMME.R']:
        model = AdaBoostClassifier(algorithm=alg, n_estimators=10)
        model.fit(iris.data, iris.target, sample_weight=iris_weights)
        # (final output, staged outputs) in the order they are checked
        pairs = [
            (model.predict(iris.data),
             list(model.staged_predict(iris.data))),
            (model.predict_proba(iris.data),
             list(model.staged_predict_proba(iris.data))),
            (model.score(iris.data, iris.target,
                         sample_weight=iris_weights),
             list(model.staged_score(iris.data, iris.target,
                                     sample_weight=iris_weights))),
        ]
        for final, stages in pairs:
            assert_equal(len(stages), 10)
            assert_array_almost_equal(final, stages[-1])

    # AdaBoost regression
    model = AdaBoostRegressor(n_estimators=10, random_state=0)
    model.fit(boston.data, boston.target, sample_weight=boston_weights)
    pairs = [
        (model.predict(boston.data),
         list(model.staged_predict(boston.data))),
        (model.score(boston.data, boston.target,
                     sample_weight=boston_weights),
         list(model.staged_score(boston.data, boston.target,
                                 sample_weight=boston_weights))),
    ]
    for final, stages in pairs:
        assert_equal(len(stages), 10)
        assert_array_almost_equal(final, stages[-1])
def adaBoostKNN(self, n_estimators, n_neighbors):
    """Boost a k-nearest-neighbours classifier and plot its error curves.

    Fits an AdaBoostClassifier around a KNeighborsClassifier on
    ``self.X_train`` / ``self.y_train``, records the test error after every
    boosting stage on ``self.X_test`` / ``self.y_test`` and shows two plots:
    test error per stage and estimator error per stage.

    Parameters
    ----------
    n_estimators : maximum number of boosting stages.
    n_neighbors : number of neighbours for the base KNN classifier.

    Returns
    -------
    (min_error_pred, min_error_n)
        Test predictions of the stage with the lowest test error and that
        stage's zero-based index. Both are ``None`` when no stage achieves
        an error below 1.0.
    """
    # AdaBoost with a k-nearest-neighbours base estimator
    X_train = self.X_train
    y_train = self.y_train
    X_test = self.X_test
    y_test = self.y_test

    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    # NOTE(review): this only sets an attribute on the estimator; it is not
    # passed to fit(). AdaBoostClassifier presumably needs a base estimator
    # whose fit() accepts sample_weight -- verify that KNN is accepted.
    clf.sample_weight = np.ones(len(y_train)) / len(y_train)
    bdt_real = AdaBoostClassifier(clf, n_estimators=n_estimators, learning_rate=1)
    bdt_real.fit(X_train, y_train)

    # Track the best stage. Initialised up front so the return statement
    # cannot hit an unbound name when no stage beats an error of 1.0.
    min_error = 1.0
    min_error_pred = None
    min_error_n = None
    real_test_errors = []
    for stage, real_test_predict in enumerate(bdt_real.staged_predict(X_test)):
        error = 1.0 - accuracy_score(y_test, real_test_predict)
        if error < min_error:
            min_error = error
            min_error_pred = real_test_predict
            min_error_n = stage
        real_test_errors.append(error)

    n_trees_real = len(bdt_real)
    real_estimator_errors = bdt_real.estimator_errors_[:n_trees_real]
    stages = range(1, n_trees_real + 1)

    plt.figure(figsize=(15, 5))

    plt.subplot(1, 2, 1)
    plt.plot(stages, real_test_errors, c='black', linestyle='dashed', label='SAMME.R')
    plt.legend()
    plt.ylabel('Test Error')
    plt.xlabel('Number of Trees')

    plt.subplot(1, 2, 2)
    plt.plot(stages, real_estimator_errors, "r", label='SAMME.R', alpha=.5)
    plt.legend()
    plt.ylabel('Error')
    plt.xlabel('Number of Trees')
    plt.ylim((.2, real_estimator_errors.max() * 1.2))
    plt.xlim((-20, n_trees_real + 20))

    # prevent overlapping y-axis labels
    plt.subplots_adjust(wspace=0.25)
    plt.show()
    return min_error_pred, min_error_n
def solution(filename):
    """Fit AdaBoost on ``<filename>.csv`` and save one boundary plot per stage.

    The CSV must contain a ``class`` column (the target); every other column
    is a feature, and the first two features are used as plot axes. After
    each boosting stage the decision function is drawn as contours at levels
    -1, 0 and 1 and saved as ``<filename>/<stage>.png`` (stages start at 1).

    Parameters
    ----------
    filename : path of the data set without the ``.csv`` extension; also
        used as the name of the output directory.
    """
    import os

    # Read the data
    data = pd.read_csv(filename + '.csv')
    y = data['class'].values
    x = data.drop(['class'], axis=1).values

    # Boosting algorithm; the learning rate is chosen per data set
    learning_rate = 0.15 if filename == 'chips' else 1.15
    model = AdaBoostClassifier(learning_rate=learning_rate, n_estimators=150)
    model.fit(x, y)

    # A first scatter plot fixes the axis limits used for the evaluation grid
    plt.figure()
    plt.scatter(x[:, 0], x[:, 1], c=colors(y), cmap=plt.cm.Paired)
    ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    xx = np.linspace(xlim[0], xlim[1], 30)
    yy = np.linspace(ylim[0], ylim[1], 30)
    YY, XX = np.meshgrid(yy, xx)
    xy = np.vstack([XX.ravel(), YY.ravel()]).T

    # savefig does not create missing directories
    if filename:
        os.makedirs(filename, exist_ok=True)

    # Draw one plot per boosting stage
    for i, func in enumerate(model.staged_decision_function(xy), start=1):
        plt.clf()
        plt.scatter(x[:, 0], x[:, 1], c=colors(y), cmap=plt.cm.Paired)
        f = func.reshape(XX.shape)
        ax = plt.gca()
        ax.contour(XX, YY, f, colors='k', levels=[-1, 0, 1], alpha=0.5,
                   linestyles=['--', '-', '--'])
        plt.legend([f'Step number {i}'])
        plt.savefig(filename + '/' + str(i) + '.png')
def show():
    """Plot test error of a stump, a full tree and AdaBoost over its stages.

    Uses 12000 samples from ``make_hastie_10_2`` (first 2000 for testing,
    the rest for training), draws the two single-tree error rates as
    horizontal lines and the AdaBoost test error after each boosting stage,
    then shows the figure.
    """
    # Number of AdaBoost iterations
    n_estimators = 200
    # Use the Hastie 10.2 synthetic data set
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    # The first 2000 of the 12000 rows are the test set, the rest is training data
    train_x, train_y = X[2000:], y[2000:]
    test_x, test_y = X[:2000], y[:2000]

    # Weak classifier (decision stump)
    dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
    dt_stump.fit(train_x, train_y)
    dt_stump_err = 1.0 - dt_stump.score(test_x, test_y)

    # Decision tree classifier
    dt = DecisionTreeClassifier()
    dt.fit(train_x, train_y)
    dt_err = 1.0 - dt.score(test_x, test_y)

    # AdaBoost classifier
    ada = AdaBoostClassifier(base_estimator=dt_stump, n_estimators=n_estimators)
    ada.fit(train_x, train_y)

    # Visualise the error rates of the three classifiers
    fig = plt.figure()
    # Make matplotlib render the Chinese labels correctly
    plt.rcParams['font.sans-serif'] = ['SimHei']
    ax = fig.add_subplot(111)
    ax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-', label=u'决策树弱分类器 错误率')
    ax.plot([1, n_estimators], [dt_err] * 2, 'k--', label=u'决策树模型 错误率')

    # One error value per stage actually produced. Boosting may stop before
    # n_estimators stages; a pre-sized zero array would then plot fake 0.0
    # errors for the missing stages.
    ada_err = np.array([
        zero_one_loss(test_y, pred_y)
        for pred_y in ada.staged_predict(test_x)
    ])
    # Plot the AdaBoost error rate of every iteration
    ax.plot(np.arange(len(ada_err)) + 1, ada_err, label='AdaBoost Test 错误率', color='orange')
    ax.set_xlabel('迭代次数')
    ax.set_ylabel('错误率')
    ax.legend(loc='upper right', fancybox=True)
    plt.show()
def ensembleProc(n_estimators, learning_rate, trainfile, testfile):
    """Return the mean staged test error of a SAMME AdaBoost of stumps.

    Both files are space-delimited text: columns 0-2 are the features and
    the last column is the label.

    Parameters
    ----------
    n_estimators : maximum number of boosting stages.
    learning_rate : AdaBoost learning rate.
    trainfile, testfile : paths readable by ``np.genfromtxt``.

    Returns
    -------
    Mean zero-one loss on the test set, averaged over the boosting stages
    that were actually produced.
    """
    features = np.genfromtxt(trainfile, delimiter=' ', usecols=(0, 1, 2))
    labels = np.genfromtxt(trainfile, delimiter=' ', usecols=(-1))
    tests = np.genfromtxt(testfile, delimiter=' ', usecols=(0, 1, 2))
    testlabels = np.genfromtxt(testfile, delimiter=' ', usecols=(-1))

    # The stump is only a template for the booster and is never used for
    # prediction itself, so it is not fitted separately.
    dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
    ada_real = AdaBoostClassifier(
        base_estimator=dt_stump,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        algorithm="SAMME")
    ada_real.fit(features, labels)

    # Boosting may stop before n_estimators stages; averaging a zero-padded
    # array of length n_estimators would then understate the error.
    error = [
        zero_one_loss(testlabels, predict)
        for predict in ada_real.staged_predict(tests)
    ]
    return np.mean(error)
def adaboost():
    """Train a SAMME AdaBoost of depth-10 trees and return its test errors.

    Data come from ``read('train')`` and ``read('test')``; every sample is
    flattened to a single feature row before fitting.

    Returns
    -------
    (bdt_discrete, discrete_test_errors)
        The fitted AdaBoostClassifier and the list of test error rates
        (1 - accuracy), one per boosting stage.
    """
    X_train, y_train = read('train')
    X_test, y_test = read('test')

    # Collapse all trailing dimensions into one feature axis
    X_train = X_train.reshape((X_train.shape[0], -1))
    X_test = X_test.reshape((X_test.shape[0], -1))

    base_tree = DecisionTreeClassifier(
        max_depth=10, min_samples_split=20, min_samples_leaf=5)
    bdt_discrete = AdaBoostClassifier(
        base_tree, n_estimators=500, learning_rate=0.5, algorithm='SAMME')
    bdt_discrete.fit(X_train, y_train)

    discrete_test_errors = [
        1. - accuracy_score(stage_predict, y_test)
        for stage_predict in bdt_discrete.staged_predict(X_test)
    ]
    return bdt_discrete, discrete_test_errors
print("adaboost classifier training in %.2f" % (time() - start))

# use cross-validation to estimate accuracy (currently disabled)
# start = time()
# train_pred = cross_val_predict(ada_clf, bag_of_words, train.cuisine, cv=2)
# print("adaboost evaluation finished in %.2f" % (time() - start))
# print("Estimated accuracy using cross-validation: " , accuracy_score(train.cuisine, train_pred))

# Use the rest of the labelled training data to measure the error after
# every boosting stage (for plotting).
test = pd.read_json("data/train2.json")
test_words = [" ".join(item) for item in test.ingredients]
test_bag = vec.transform(test_words).toarray()

test_errors = [
    1.0 - accuracy_score(test_predict, test.cuisine)
    for test_predict in ada_clf.staged_predict(test_bag)
]

plt.figure(figsize=(15, 5))
plt.plot(range(1, len(ada_clf) + 1), test_errors)
plt.ylabel("Test Error")
plt.xlabel("Number of Trees")
plt.show()

# Load the unlabelled testing data and build its bag of words with the
# already-fitted vectorizer.
test = pd.read_json("data/test.json")
test_words = [" ".join(item) for item in test.ingredients]
test_bag = vec.transform(test_words).toarray()
# Now predict the value of the digit on the second half:
predicted = classifier.predict(X_test)
r_predicted = r_classifier.predict(X_test)

# Text reports for the boosted classifier and the random forest.
print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(y_test, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, predicted))
print("Classification report for classifier %s:\n%s\n"
      % (r_classifier, metrics.classification_report(y_test, r_predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, r_predicted))

# x-axis: ensemble sizes 1..len(classifier)
n_trees = list(range(1, len(classifier) + 1))

# Error rate (1 - accuracy) of the boosted classifier after each stage.
test_errors = [
    1. - accuracy_score(y_test, p) for p in classifier.staged_predict(X_test)
]
train_errors = [
    1. - accuracy_score(y_train, p) for p in classifier.staged_predict(X_train)
]

# A random forest has no staged prediction here, so a fresh forest is
# trained for every ensemble size; a dot is printed per fit as progress.
test_errors_rand = []
for i in range(1, args.estimators + 1):
    print('.', end=' ')
    r_classifier = RandomForestClassifier(n_estimators=i, n_jobs=args.jobs,
                                          max_depth=args.max_depth)
    r_classifier.fit(X_train, y_train)
    r_predicted = r_classifier.predict(X_test)
    test_errors_rand.append(1. - accuracy_score(y_test, r_predicted))
print('.')

pl.subplot(1, 1, 1)
pl.plot(n_trees, test_errors, c='red', label='AdaBoost.%s' % args.boost)
test_word_arrayLabel.append(label[randomIndex]) test_word_array.append(train_mood_array[randomIndex]) del (train_mood_array[randomIndex]) del (label[randomIndex]) except Exception as e: print(e) multi = MultinomialNB() multi.fit(train_mood_array, label) multi.predict(test_word_array) ada_real = AdaBoostClassifier( base_estimator=multi, learning_rate=learning_rate, n_estimators=n_estimators) ada_real.fit(train_mood_array, label) ada_real_err = np.zeros((n_estimators,)) for i, y_pred in enumerate(ada_real.staged_predict(test_word_array)): ada_real_err[i] = zero_one_loss(y_pred, test_word_arrayLabel) print(ada_real_err[i]) # ROC start X_train = train_mood_array X_test = test_word_array y_train = label y_test = test_word_arrayLabel y = label y_score = ada_real.predict_proba(test_word_array) fpr = dict() tpr = dict() roc_auc = dict() for i in [0, 1]: fpr[i], tpr[i], _ = roc_curve(test_word_arrayLabel, y_score[:, 0], pos_label=1)
def bdtModel(df_sig_train, df_bkg_train, df_sig_test, df_bkg_test):
    """Train a boosted-decision-tree signal/background classifier and report on it.

    Signal rows are labelled 1 and background rows 0. A SAMME AdaBoost of
    depth-3 trees is fitted on the training frames, evaluated on the test
    frames, applied to one hard-coded sample, and two PDFs are written:
    ``Var_importance.pdf`` and ``ntrees.pdf``.

    Parameters
    ----------
    df_sig_train, df_bkg_train : training features for signal / background.
        ``df_sig_train`` must expose ``.columns`` (used to label the
        importance plot).
    df_sig_test, df_bkg_test : test features for signal / background.

    Returns
    -------
    tuple
        ``(model, X, y, result, train_accuracy, error_test, score,
        h_test_s, h_test_b)`` where ``result`` / ``score`` are the
        ``predict_proba`` / ``decision_function`` outputs for the hard-coded
        sample and ``h_test_s`` / ``h_test_b`` come from ``visualSigBkg``.
    """
    n_estimators = 400

    # ---------- Prepare Training ----------
    X_sig = np.array(df_sig_train)
    y_sig = np.array(X_sig.shape[0] * [1])
    X_bkg = np.array(df_bkg_train)
    y_bkg = np.array(X_bkg.shape[0] * [0])
    X = np.concatenate((X_sig, X_bkg))
    y = np.concatenate((y_sig, y_bkg))
    print('X_sig.shape: ', X_sig.shape)
    print('y_sig.shape: ', y_sig.shape)
    print('X_bkg.shape: ', X_bkg.shape)
    print('y_bkg.shape: ', y_bkg.shape)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    # ---------- Prepare Testing ----------
    X_sig_test = np.array(df_sig_test)
    y_sig_test = np.array(X_sig_test.shape[0] * [1])
    X_bkg_test = np.array(df_bkg_test)
    y_bkg_test = np.array(X_bkg_test.shape[0] * [0])
    X_test = np.concatenate((X_sig_test, X_bkg_test))
    y_test = np.concatenate((y_sig_test, y_bkg_test))
    print('X_sig_test.shape: ', X_sig_test.shape)
    print('y_sig_test.shape: ', y_sig_test.shape)
    print('X_bkg_test.shape: ', X_bkg_test.shape)
    print('y_bkg_test.shape: ', y_bkg_test.shape)
    print('X_test.shape: ', X_test.shape)
    print('y_test.shape: ', y_test.shape)

    # ---------- Model ----------
    # Each leaf must hold at least 5% of the training rows. The count is
    # passed as an integer: a float min_samples_leaf is interpreted as a
    # fraction by scikit-learn, so 0.05 * len(X) would be out of range.
    min_leaf = max(1, int(np.ceil(0.05 * len(X))))
    dt = DecisionTreeClassifier(max_depth=3, min_samples_leaf=min_leaf)
    model = AdaBoostClassifier(dt, algorithm='SAMME',
                               n_estimators=n_estimators, learning_rate=0.5)
    model.fit(X, y)
    train_accuracy = model.score(X, y)

    print('---------- Training/Testing info ----------')
    print('Accuracy (training): ', train_accuracy)
    print('Null Error Rate (training): ', y.mean())

    predicted_test = model.predict(X_test)
    # Fraction of test rows whose predicted label differs from the truth.
    error_test = float(np.count_nonzero(predicted_test != y_test)) / float(len(y_test))
    print("Error: ", error_test)
    print("Accuracy (testing): ", metrics.accuracy_score(y_test, predicted_test))
    print("Recall (testing): ", metrics.recall_score(y_test, predicted_test))
    print("F1 score (testing): ", metrics.f1_score(y_test, predicted_test))
    print("ROC area under curve (testing): ", metrics.roc_auc_score(y_test, predicted_test))

    # One hard-coded sample.
    # presumably ordered PTS, AST, REB, STL, BLK, FG_PCT, FG3_PCT, FT_PCT,
    # MIN, EFF, WL -- verify against the training columns.
    # Reshaped to (1, n_features) because the estimator API expects 2-D input.
    user_input = np.array([10.15, 1.95, 6.77, 1.12, 0.28, 0.51, 0.37, 0.47,
                           32.5, 14.8, 0.53], dtype=float).reshape(1, -1)
    score = model.decision_function(user_input)
    print('Score (user input): ', score)
    result = model.predict_proba(user_input)
    print('Probability of 1 (user input): ', result)

    # --------- Visualization -----------
    Classifier_training_S = model.decision_function(X[y > 0.5]).ravel()
    Classifier_training_B = model.decision_function(X[y < 0.5]).ravel()
    Classifier_testing_S = model.decision_function(X_test[y_test > 0.5]).ravel()
    Classifier_testing_B = model.decision_function(X_test[y_test < 0.5]).ravel()
    (h_test_s, h_test_b) = visualSigBkg("BDT", Classifier_training_S, Classifier_training_B,
                                        Classifier_testing_S, Classifier_testing_B)

    # -------- Variable Importance ---------
    feature_importance = model.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    mpl.style.use('ggplot')
    pl.subplot(1, 2, 2)
    pl.barh(pos, feature_importance[sorted_idx], align='center')
    pl.yticks(pos, df_sig_train.columns[sorted_idx])
    pl.xlabel('Relative Importance', fontsize=15)
    pl.title('Variable Importance', fontsize=15)
    plt.savefig("Var_importance.pdf")
    plt.close()

    # -------- Error versus number of trees ---------
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # One value per stage actually produced; boosting may stop early, and a
    # zero-padded array would then plot fake 0.0 errors.
    model_err = [zero_one_loss(y_test, y_pred)
                 for y_pred in model.staged_predict(X_test)]
    model_err_train = [zero_one_loss(y, y_pred)
                       for y_pred in model.staged_predict(X)]
    ax.plot(np.arange(len(model_err)) + 1, model_err,
            label='AdaBoost Test Error', color='orange')
    ax.plot(np.arange(len(model_err_train)) + 1, model_err_train,
            label='AdaBoost Train Error', color='green')
    ax.set_ylim((0.25, 0.35))
    ax.set_xlabel('Number of Trees')
    ax.set_ylabel('Error Rate')
    leg = ax.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.7)
    plt.savefig("ntrees.pdf")
    plt.close()

    return (model, X, y, result, train_accuracy, error_test, score, h_test_s, h_test_b)
# Base learner: a decision stump.
dtree = DecisionTreeClassifier(max_depth=1)

# AdaBoostClassifier constructor parameters, as noted by the author:
#   base_estimator=None    type of the sub-model
#   n_estimators=50        number of sub-models
#   learning_rate=1.       learning step / shrinkage factor
#   algorithm='SAMME.R'
#   random_state=None
algo = AdaBoostClassifier(base_estimator=dtree, n_estimators=10)

# Train the model
algo.fit(X_train, y_train)

# Evaluate the model: accuracy on the training and on the test set
print('训练集上的准确率:{}'.format(algo.score(X_train, y_train)))
print('测试集上的准确率:{}'.format(algo.score(X_test, y_test)))

# Predicted labels, probabilities and log-probabilities for three samples
x_test = [
    [6.9, 3.1, 5.1, 2.3],
    [6.1, 2.8, 4.0, 1.3],
    [5.2, 3.4, 1.4, 0.2],
]
print('样本预测值:')
print(algo.predict(x_test))
print("样本的预测概率值:")
print(algo.predict_proba(x_test))
print("样本的预测概率值的Log转换值:")
print(algo.predict_log_proba(x_test))

# All fitted sub-models
print("训练好的所有子模型:\n{}".format(algo.estimators_))

# Predictions after each boosting stage for a second set of samples
x_test = [
    [6.9, 3.1, 5.1, 2.3],
    [6.1, 2.8, 4.0, 1.3],
    [5.2, 3.4, 2.9, 0.8],
]
generator = algo.staged_predict(x_test)
print('阶段预测值:')
for i in generator:
    print(i)

# Feature importances of the ensemble
print('各特征属性权重列表:{}'.format(algo.feature_importances_))
label = label[index]

# First 30000 samples for training, the remainder for testing. Samples and
# labels must be split at the same index, otherwise y_test does not line up
# with X_test (the original sliced the labels at 3000).
(X_train, X_test) = (data[0:30000], data[30000:])
(y_train, y_test) = (label[0:30000], label[30000:])

# SAMME AdaBoost around the project's CnnModel base estimator.
bdt_discrete = AdaBoostClassifier(
    CnnModel(),
    n_estimators=500,
    learning_rate=0.3,
    algorithm="SAMME")
bdt_discrete.fit(X_train, y_train)

# Test error rate (1 - accuracy) after each boosting stage.
discrete_test_errors = []
for discrete_train_predict in bdt_discrete.staged_predict(X_test):
    discrete_test_errors.append(
        1. - accuracy_score(y_test, discrete_train_predict))

# Per-estimator errors and weights, truncated to the fitted ensemble size.
n_trees_discrete = len(bdt_discrete)
discrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete]
discrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete]

plt.figure(figsize=(15, 5))
plt.subplot(131)
plt.plot(range(1, n_trees_discrete + 1), discrete_test_errors, c='black')
plt.legend()
plt.ylim(0.18, 0.62)
plt.ylabel('Test Error')
from sklearn.ensemble import AdaBoostClassifier
from read_data import read_data


def results_from_examples(ps, ls):
    """Return a list with 1 where prediction and label agree, else 0."""
    return [1 if p == l else 0 for p, l in zip(ps, ls)]


def error_rate(rs):
    """Return 1 minus the mean of the 0/1 results in ``rs``.

    Raises ZeroDivisionError when ``rs`` is empty.
    """
    return 1.0 - ((1.0 * sum(rs)) / len(rs))


print("Sklearn")
examples, labels = read_data('Data/clean1_clean.data')
clf = AdaBoostClassifier(n_estimators=50)
# fit() returns the estimator itself, so `a` and `clf` are the same object.
a = clf.fit(examples, labels)
score = a.score(examples, labels)

# One CSV row per boosting stage: the ensemble's training error after that
# stage and the error recorded for that stage's own estimator.
print("Estimator, Ensemble error, Classifier error")
for i, value in enumerate(clf.staged_predict(examples)):
    rs = results_from_examples(value, labels)
    print(str(i) + "," + str(error_rate(rs)) + "," + str(clf.estimator_errors_[i]))
print(score)
# Split the frame into the target column and the feature matrix.
targetValues = data["class"].values
attributes = data.drop(["class"], axis=1).values
attributesTrain, attributesTest, targetValuesTrain, targetValueTest = \
    train_test_split(attributes, targetValues, test_size=0.1)

# Search 100 geometrically spaced learning rates (1e-3 * 1.1**i) and keep
# the rate / stage index with the highest held-out staged score.
bestAccuracy = 0
bestLearningRate = 0
bestIteration = 0
for i in range(100):
    learningRate = 1e-3 * 1.1**i
    classifier = AdaBoostClassifier(n_estimators=100,
                                    learning_rate=learningRate)
    classifier.fit(attributesTrain, targetValuesTrain)
    # Generator only; it is not consumed in this part of the script.
    targetValuePredictedFunctions = classifier.staged_predict(attributes)
    scoreFunctions = classifier.staged_score(attributesTest, targetValueTest)
    # j is the zero-based stage index
    for j, item in enumerate(scoreFunctions):
        if item > bestAccuracy:
            bestAccuracy, bestLearningRate, bestIteration = item, learningRate, j

print(bestLearningRate)
print(bestAccuracy)
print(bestIteration)

# Refit with the best learning rate found.
classifier = AdaBoostClassifier(n_estimators=100,
                                learning_rate=bestLearningRate)
classifier.fit(attributesTrain, targetValuesTrain)
train_x, train_y = X[:2000], y[:2000]

# Weak classifier (decision stump)
dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
dt_stump.fit(train_x, train_y)
dt_stump_err = 1.0 - dt_stump.score(test_x, test_y)

# Decision tree classifier
dt = DecisionTreeClassifier()
dt.fit(train_x, train_y)
dt_err = 1.0 - dt.score(test_x, test_y)

# AdaBoost classifier
ada = AdaBoostClassifier(base_estimator=dt_stump, n_estimators=n_estimators)
ada.fit(train_x, train_y)

# Visualise the error rates of the three classifiers
fig = plt.figure()
# Make matplotlib render the Chinese labels correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
ax = fig.add_subplot(111)
ax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-', label=u'决策树弱分类器 错误率')
ax.plot([1, n_estimators], [dt_err] * 2, 'k--', label=u'决策树模型 错误率')

# One error value per stage actually produced. Boosting may stop before
# n_estimators stages; a pre-sized zero array would then plot fake 0.0
# errors for the missing stages.
ada_err = np.array([
    zero_one_loss(test_y, pred_y)
    for pred_y in ada.staged_predict(test_x)
])

# Plot the AdaBoost error rate of every iteration
ax.plot(np.arange(len(ada_err)) + 1, ada_err, label='AdaBoost Test 错误率', color='orange')
ax.set_xlabel('迭代次数')
ax.set_ylabel('错误率')
leg = ax.legend(loc='upper right', fancybox=True)
plt.show()
#!/usr/bin/env python
if __name__ == '__main__':
    from sklearn.ensemble import AdaBoostClassifier as ABC
    from sklearn.tree import DecisionTreeClassifier as DTC
    import numpy as np
    from sklearn.metrics import accuracy_score
    from final_utils import read_hwfile

    # initialize data
    dat, lab, nDat = read_hwfile('ml14fall_train_align.dat.hog.dat', 169)

    # Hold out the last fifth of the data for validation. Floor division
    # keeps the split sizes integral so they can be used as slice bounds.
    nVal = nDat // 5
    nTrn = nDat - nVal
    datTrn = dat[:nTrn]
    labTrn = lab[:nTrn]
    datVal = dat[-nVal:]
    labVal = lab[-nVal:]
    print("#trn = {}, #val = {}".format(nTrn, nVal))

    # AdaBoost over depth-6 trees that consider one feature per split.
    classfier = ABC(DTC(max_depth=6, max_features=1), n_estimators=50000)
    classfier.fit(datTrn, labTrn)

    # Report validation accuracy after every 10th boosting stage.
    for i, labPre in enumerate(classfier.staged_predict(datVal)):
        if i % 10 == 9:
            print(accuracy_score(labVal, labPre))
# The first n_split rows are the test set, the remainder the training set.
n_split = 3000
X_train, X_test = X[n_split:], X[:n_split]
Y_train, Y_test = y[n_split:], y[:n_split]

clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                         algorithm="SAMME.R", n_estimators=600)
clf.fit(X_train, Y_train)

# Error rate (1 - accuracy) after each boosting stage.
train_errors = [
    1 - accuracy_score(Y_train, stage_pred)
    for stage_pred in clf.staged_predict(X_train)
]
test_errors = [
    1 - accuracy_score(Y_test, stage_pred)
    for stage_pred in clf.staged_predict(X_test)
]

# x-axis sized from the stages actually produced: boosting may stop before
# the 600 requested estimators, and a fixed 600-point axis would then not
# match the error lists.
n_stages = len(train_errors)
x = np.linspace(1, n_stages, n_stages)

plt.style.use('ggplot')
plt.plot(x, train_errors, 'r', label='SAMME.R Train Error')
plt.plot(x, test_errors, 'b', label='SAMME.R Test Error')
plt.legend(loc='upper right', fancybox=True)
plt.xlabel('Eestimator Numbers')
plt.ylabel('Error Rate')
# plt.show()
plt.savefig('error_rate.png', dpi=250)
# https://www.zybuluo.com/yxd/note/614495
def train_net(self, model_s, batch_size = 128, epochs = 20 ):
    """Train the model named ``model_s`` on ``self.X`` / ``self.labels``.

    The data is split 80/20 into train and test sets.  Depending on
    ``self.boost_nn`` and ``is_nn(model_s)`` the model is either fitted
    directly (with early stopping and checkpointing callbacks) or wrapped
    in an ``AdaBoostClassifier``.  Test metrics are stored on the instance
    (``self.f1``, ``self.rocauc``, ``self.accuracy``, ``self.AA``,
    ``self.BB``) and the training history is plotted and saved.

    Args:
        model_s: Name of the model, passed to ``get_model`` and used in
            output file names.
        batch_size: Batch size used for fitting.
        epochs: Number of epochs; also used as the early-stopping patience.
    """
    n_classes = 2

    # NOTE: optional down-sampling (self.reduce_data(20)) and SMOTE
    # re-balancing (followed by self.restore_energy_labels()) used to run
    # here and are currently disabled.

    # Test/train split
    x_train, x_test, y_train, y_test = train_test_split(
        self.X, self.labels, test_size = .2, shuffle = True)

    if self.verbose:
        print('Training balance: %.2f. Testing balance: %.2f' % (np.sum(y_train)/len(y_train), np.sum(y_test)/len(y_test)))

    input_shape = None
    if is_cnn(model_s):
        grey2rgb = requires_rgb(model_s)
        x_train, input_shape = self.prepare_X_for_cnn(x_train, grey2rgb)
        x_test, _ = self.prepare_X_for_cnn(x_test, grey2rgb)

    # convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, n_classes)
    y_test = keras.utils.to_categorical(y_test, n_classes)

    # Squawk if desired
    if self.verbose:
        print('x_train shape:', x_train.shape)
        print(x_train.shape[0], 'train samples')
        print(x_test.shape[0], 'test samples')

    # Get it
    model = get_model(model_s, input_shape)

    # CNN will likely overfit XY states, at least on L = 7 lattice. Hence we
    # need early stopping. Patience is set to epochs such that we keep
    # looking for the best model over all epochs.
    es = EarlyStopping(monitor = 'val_loss', mode = 'min', patience = epochs, verbose = 1)

    # We also want to store our best model, as judged by accuracy
    mc = ModelCheckpoint('Models/Epoch{epoch:02d}_Acc{val_acc:.2f}_V%d_L%d_M%d_N%d_%s.h5' %
                         (int(self.X_vortex), self.L, self.M, self.N, model_s),
                         monitor='val_acc', mode='max', verbose=1, save_best_only=True)

    # Decide once whether this run goes through AdaBoost; the metrics and
    # plotting code below needs the same answer.
    boosted = bool(self.boost_nn and is_nn(model_s))

    # Check for boosting
    if boosted:
        # Different convention for labels. AdaBoostClassifier expects Y to be of form (nsamples,)
        # This in turn means models in get_model must be modified _WHEN_ used in conjuction with AdaBoostClf
        # NOTE(review): this mapping turns one-hot class 0 into label 1 and
        # class 1 into label 0 - confirm the flip is intended.  It is kept
        # as-is for training and undone when the metrics are computed.
        y_test = y_test[:, 0] + y_test[:, 1]*-1
        y_train = y_train[:, 0] + y_train[:, 1]*-1
        y_test = (y_test+1)/2
        y_train = (y_train+1)/2

        build = lambda: get_model(model_s, input_shape)
        est = KerasClassifier(build_fn = build, epochs = epochs, batch_size = batch_size, verbose = 0)
        model = AdaBoostClassifier(base_estimator = est, n_estimators = 1)

        x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = .1)
        print(x_train.shape, y_train.shape)
        model.fit(x_train, y_train)
        self.MODEL = model
        self.XTE = x_test

        # Need to construct our own history manually
        pred_val = model.staged_predict(x_val)
        pred_tr = model.staged_predict(x_train)
        accs_val = []
        accs_train = []
        for predv, predr in zip(pred_val, pred_tr):
            accs_val.append(accuracy_score(predv, y_val))
            accs_train.append(accuracy_score(predr, y_train))

        # Bit lazy, but using accuracy is less hassle. But then we need to trick ourselves:
        history = Bunch()
        history.history = {'loss': accs_train, 'val_loss': accs_val }
        score = (-1, accuracy_score(model.predict(x_test), y_test))

        # If it's an AdaBoosted neural net, we won't do early stopping or save/load.
        # It's hackish, but we just store it in instance. Why? Because we already know
        # it'll perform worse than a CNN, so it's not worth the effort at the moment.
        self.model_adaboost = model
    else:
        # Fit and record history
        history = model.fit(x_train, y_train,
                            batch_size=batch_size,
                            epochs=epochs,
                            verbose=1,
                            callbacks = [es, mc],
                            validation_split = 0.1)
        # Get the score on the unseen test set
        score = model.evaluate(x_test, y_test, verbose=0)

    # Squawk if desired
    if self.verbose:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])

    if boosted:
        # On the boosted path the targets and predictions are 1-D label
        # vectors, so the one-hot column indexing used below would raise.
        # The labels were flipped above (1 == class 0); flip them back so the
        # positive class is the same as on the non-boosted path.
        y_true = np.round(1 - np.asarray(y_test)).astype(int)
        y_pred = np.round(1 - np.asarray(model.predict(x_test)).ravel()).astype(int)
    else:
        y_true = y_test[:, 1].astype(int)
        y_pred = np.round(model.predict(x_test)[:, 1]).astype(int)
    self.AA = y_true
    self.BB = y_pred
    print(classification_report(y_true, y_pred))
    self.f1 = f1_score(y_true, y_pred)
    print('F1-score: %.3f' % self.f1)
    print(confusion_matrix(y_true, y_pred))
    self.rocauc = roc_auc_score(y_true, y_pred)
    self.accuracy = accuracy_score(y_true, y_pred)

    # Plot training history
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(history.history['loss'], label = 'train')
    ax.plot(history.history['val_loss'], label = 'val')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    if boosted:
        # The boosted history holds accuracies, not losses (see above).
        ax.set_ylabel('Accuracy')
    ax.set_title('Model: %s, Test score: %.3f' % (model_s, score[1]))
    ax.legend()

    # Save the plot to file
    plt.savefig('Plots/TrainTestScores/V%d_L%d_M%d_N%d_%s.png' %
                (int(self.X_vortex), self.L, self.M, self.N, model_s) )

    # Save a graph of the model.  Skipped on the boosted path, where `model`
    # is an AdaBoostClassifier rather than a Keras model with a layer graph.
    if not boosted:
        plot_model(model, to_file = 'Plots/Model Graphs/%s.png' % (model_s) )

    # And show plot if desired
    if self.plotty:
        plt.show()
58,115,99,84,132,13,35,77,89,113,102,36,38, 131,39,94,5,66,2,134,51,96,24,114,121,120,46] print(selectedBands) selectedArray = data[:, selectedBands] # Making AdaBoost bdt_real = AdaBoostClassifier( DecisionTreeClassifier(max_depth=3), n_estimators=500, learning_rate=1) bdt_real.fit(selectedArray[np.where(train != 0)[0]], train[train != 0]) real_test_errors = [] for real_test_predict in \ bdt_real.staged_predict(selectedArray[np.where(test != 0)[0]]): real_test_errors.append(1. - accuracy_score(real_test_predict, test[test != 0])) n_trees_real = len(bdt_real) real_estimator_errors = bdt_real.estimator_errors_[:n_trees_real] classified = bdt_real.predict(selectedArray[np.where(test != 0)[0]]) score = accuracy_score(classified, test[test != 0]) print(score) plt.figure(figsize=(15, 5)) plt.plot(range(1, n_trees_real + 1), real_test_errors, c='black', linestyle='dashed', label='SAMME.R') plt.show()
#split into training and testing samples. test_size = proportion of data used for test x_train, x_test, y_train, y_test = train_test_split(value_to_classify, targs_to_classify, test_size = .4) ######################### #ADABoost Classifier ######################### bdt_real = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=600,learning_rate=1) bdt_discrete = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=600,learning_rate=1.5,algorithm="SAMME") bdt_real.fit(x_train, y_train) bdt_discrete.fit(x_train, y_train) real_test_errors = [] discrete_test_errors = [] for real_test_predict, discrete_train_predict in zip(bdt_real.staged_predict(x_test), bdt_discrete.staged_predict(x_test)): real_test_errors.append(1. - accuracy_score(real_test_predict, y_test)) discrete_test_errors.append(1. - accuracy_score(discrete_train_predict, y_test)) n_trees_discrete = len(bdt_discrete) n_trees_real = len(bdt_real) # Boosting might terminate early, but the following arrays are always # n_estimators long. We crop them to the actual number of trees here: discrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete] real_estimator_errors = bdt_real.estimator_errors_[:n_trees_real] discrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete] # Test on the testing data set and display the accuracies ypred_r = bdt_real.predict(x_test) ypred_e = bdt_discrete.predict(x_test)
try: randomIndex = int(random.uniform(0, len(train_mood_array))) test_word_arrayLabel.append(label[randomIndex]) test_word_array.append(train_mood_array[randomIndex]) del (train_mood_array[randomIndex]) del (label[randomIndex]) except Exception as e: print(e) multi = MultinomialNB() ada_real = AdaBoostClassifier(base_estimator=multi, learning_rate=learning_rate, n_estimators=n_estimators, algorithm="SAMME.R") ada_real.fit(train_mood_array, label) ada_real_err = np.zeros((n_estimators, )) # 变成一个一维的矩阵,长度为n for i, y_pred in enumerate(ada_real.staged_predict(test_word_array)): # 测试 ada_real_err[i] = zero_one_loss(y_pred, test_word_arrayLabel) # 得出不同的,然后除于总数 ada_real_err_train = np.zeros((n_estimators, )) for i, y_pred in enumerate( ada_real.staged_predict(train_mood_array)): # 训练样本对训练样本的结果 ada_real_err_train[i] = zero_one_loss(y_pred, label) def test(word): word_array = bayes.build_word_array(word) asfaiajioaf = bayes.setOfWordsListToVecTor(vocabList, word_array) return ada_real.predict(asfaiajioaf)[0] def testandscore(word):
learning_rate=1) bdt_discrete = AdaBoostClassifier( DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1.5, algorithm="SAMME") bdt_real.fit(X_train, y_train) bdt_discrete.fit(X_train, y_train) real_test_errors = [] discrete_test_errors = [] for real_test_predict, discrete_train_predict in zip( bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)): real_test_errors.append( 1. - accuracy_score(real_test_predict, y_test)) discrete_test_errors.append( 1. - accuracy_score(discrete_train_predict, y_test)) n_trees_discrete = len(bdt_discrete) n_trees_real = len(bdt_real) # Boosting might terminate early, but the following arrays are always # n_estimators long. We crop them to the actual number of trees here: discrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete] real_estimator_errors = bdt_real.estimator_errors_[:n_trees_real] discrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete] plt.figure(figsize=(15, 5))
X, y = make_gaussian_quantiles(n_samples=13000, n_features=10,n_classes=3, random_state=1) n_split = 3000 X_train, X_test = X[:n_split], X[n_split:] y_train, y_test = y[:n_split], y[n_split:] bdt_real = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=600,learning_rate=1) bdt_discrete = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=600,learning_rate=1.5,algorithm="SAMME") bdt_real.fit(X_train, y_train) bdt_discrete.fit(X_train, y_train) real_test_error = [] discrete_test_error = [] for rea_test_predict in bdt_real.staged_predict(X_test): real_test_error.append(1. - accuracy_score(rea_test_predict, y_test)) for discrete_test_predict in bdt_discrete.staged_predict(X_test): discrete_test_error.append(1. - accuracy_score(discrete_test_predict, y_test)) n_trees_discrete = len(bdt_discrete) n_trees_real = len(bdt_real) discrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete] real_estimator_errors = bdt_real.estimator_errors_[:n_trees_real] discrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete] plt.figure(figsize=(15,5)) plt.subplot(131) plt.plot(range(1, n_trees_discrete + 1), discrete_test_error, c='black', label='SAMME') plt.plot(range(1, n_trees_real + 1),
base_estimator=dt_stump, learning_rate=learning_rate, n_estimators=n_estimators, algorithm="SAMME.R") ada_real.fit(X_train, y_train) fig = plt.figure() ax = fig.add_subplot(111) ax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-', label='Decision Stump Error') ax.plot([1, n_estimators], [dt_err] * 2, 'k--', label='Decision Tree Error') ada_discrete_err = np.zeros((n_estimators,)) for i, y_pred in enumerate(ada_discrete.staged_predict(X_test)): ada_discrete_err[i] = zero_one_loss(y_pred, y_test) ada_discrete_err_train = np.zeros((n_estimators,)) for i, y_pred in enumerate(ada_discrete.staged_predict(X_train)): ada_discrete_err_train[i] = zero_one_loss(y_pred, y_train) ada_real_err = np.zeros((n_estimators,)) for i, y_pred in enumerate(ada_real.staged_predict(X_test)): ada_real_err[i] = zero_one_loss(y_pred, y_test) ada_real_err_train = np.zeros((n_estimators,)) for i, y_pred in enumerate(ada_real.staged_predict(X_train)): ada_real_err_train[i] = zero_one_loss(y_pred, y_train) ax.plot(np.arange(n_estimators) + 1, ada_discrete_err,
plt.legend() plt.xlabel("bumpiness") plt.ylabel("grade") plt.show() ################################################################################ ### your code here! name your classifier object clf if you want the ### visualization code (prettyPicture) to show you the decision boundary from sklearn.ensemble import AdaBoostClassifier from sklearn.tree import DecisionTreeClassifier clf = AdaBoostClassifier(DecisionTreeClassifier()) clf.fit(features_train, labels_train) predict = clf.staged_predict(features_test) result = np.asarray(list(predict)) - np.asarray(labels_test) target_hit = 0 for item in result: number_of_hits = 0 for num in item: if num == 0: number_of_hits += 1 if number_of_hits == len(item) - 1: print("Found.") target_hit += 1 accuracy = float(target_hit) / len(result) print(str(accuracy)) try:
def test_sparse_classification():
    # Check that AdaBoost gives identical results on sparse and dense input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Fit like SVC, then remember the container type of X."""
            super(CustomSVC, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(n_classes=1, n_samples=15,
                                                   n_features=5,
                                                   random_state=42)
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    def new_classifier():
        # Every fit gets its own, identically configured ensemble.
        return AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME")

    plain_methods = ("predict", "decision_function",
                     "predict_log_proba", "predict_proba")
    staged_methods = ("staged_decision_function", "staged_predict",
                      "staged_predict_proba")

    for to_sparse in (csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                      dok_matrix):
        X_train_sparse = to_sparse(X_train)
        X_test_sparse = to_sparse(X_test)

        # One ensemble trained on the sparse container, one on the dense one.
        sparse_clf = new_classifier().fit(X_train_sparse, y_train)
        dense_clf = new_classifier().fit(X_train, y_train)

        # Methods that take only the feature matrix.
        for method in plain_methods:
            from_sparse = getattr(sparse_clf, method)(X_test_sparse)
            from_dense = getattr(dense_clf, method)(X_test)
            assert_array_equal(from_sparse, from_dense)

        # score
        assert_array_equal(sparse_clf.score(X_test_sparse, y_test),
                           dense_clf.score(X_test, y_test))

        # Staged variants: compare stage by stage.
        for method in staged_methods:
            sparse_stages = getattr(sparse_clf, method)(X_test_sparse)
            dense_stages = getattr(dense_clf, method)(X_test)
            for from_sparse, from_dense in zip(sparse_stages, dense_stages):
                assert_array_equal(from_sparse, from_dense)

        # staged_score
        sparse_stages = sparse_clf.staged_score(X_test_sparse, y_test)
        dense_stages = dense_clf.staged_score(X_test, y_test)
        for from_sparse, from_dense in zip(sparse_stages, dense_stages):
            assert_array_equal(from_sparse, from_dense)

        # Verify sparsity of data is maintained during training
        seen_types = [est.data_type_ for est in sparse_clf.estimators_]
        assert all(t == csc_matrix or t == csr_matrix for t in seen_types)
ada_discrete = AdaBoostClassifier( base_estimator=dt_stump, learning_rate=learning_rate, n_estimators=n_estimators, algorithm="SAMME") ada_discrete.fit(X_train, y_train) ada_real = AdaBoostClassifier( base_estimator=dt_stump, learning_rate=learning_rate, n_estimators=n_estimators, algorithm="SAMME.R") ada_real.fit(X_train, y_train) ada_discrete_err = np.zeros((n_estimators,)) for i, y_pred in enumerate(ada_discrete.staged_predict(X_test)): ada_discrete_err[i] = zero_one_loss(y_pred, y_test)/10. ada_discrete_err_ave += ada_discrete_err ada_discrete_err_train = np.zeros((n_estimators,)) for i, y_pred in enumerate(ada_discrete.staged_predict(X_train)): ada_discrete_err_train[i] = zero_one_loss(y_pred, y_train)/10. ada_discrete_err_train_ave += ada_discrete_err_train ada_real_err = np.zeros((n_estimators,)) for i, y_pred in enumerate(ada_real.staged_predict(X_test)): ada_real_err[i] = zero_one_loss(y_pred, y_test)/10. ada_real_err_ave += ada_real_err ada_real_err_train = np.zeros((n_estimators,)) for i, y_pred in enumerate(ada_real.staged_predict(X_train)):
def test_sparse_classification():
    # Sparse and dense training data must lead to the same AdaBoost outputs.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Delegate to SVC.fit and store type(X) for later verification."""
            super(CustomSVC, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    def fit_ensemble(features, targets):
        # Build a fresh SAMME ensemble of CustomSVC and fit it.
        ensemble = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        )
        return ensemble.fit(features, targets)

    def assert_stages_equal(sparse_stages, dense_stages):
        # Compare two staged generators element by element.
        for sparse_stage, dense_stage in zip(sparse_stages, dense_stages):
            assert_array_equal(sparse_stage, dense_stage)

    X, y = datasets.make_multilabel_classification(n_classes=1, n_samples=15,
                                                   n_features=5,
                                                   random_state=42)
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    sparse_formats = [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                      dok_matrix]
    for sparse_format in sparse_formats:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format, then on dense format
        sparse_classifier = fit_ensemble(X_train_sparse, y_train)
        dense_classifier = fit_ensemble(X_train, y_train)

        # predict
        assert_array_equal(sparse_classifier.predict(X_test_sparse),
                           dense_classifier.predict(X_test))

        # decision_function
        assert_array_equal(
            sparse_classifier.decision_function(X_test_sparse),
            dense_classifier.decision_function(X_test))

        # predict_log_proba
        assert_array_equal(
            sparse_classifier.predict_log_proba(X_test_sparse),
            dense_classifier.predict_log_proba(X_test))

        # predict_proba
        assert_array_equal(sparse_classifier.predict_proba(X_test_sparse),
                           dense_classifier.predict_proba(X_test))

        # score
        assert_array_equal(sparse_classifier.score(X_test_sparse, y_test),
                           dense_classifier.score(X_test, y_test))

        # staged_decision_function
        assert_stages_equal(
            sparse_classifier.staged_decision_function(X_test_sparse),
            dense_classifier.staged_decision_function(X_test))

        # staged_predict
        assert_stages_equal(sparse_classifier.staged_predict(X_test_sparse),
                            dense_classifier.staged_predict(X_test))

        # staged_predict_proba
        assert_stages_equal(
            sparse_classifier.staged_predict_proba(X_test_sparse),
            dense_classifier.staged_predict_proba(X_test))

        # staged_score
        assert_stages_equal(
            sparse_classifier.staged_score(X_test_sparse, y_test),
            dense_classifier.staged_score(X_test, y_test))

        # Verify sparsity of data is maintained during training
        recorded = [fitted.data_type_
                    for fitted in sparse_classifier.estimators_]
        assert all([(kind == csc_matrix or kind == csr_matrix)
                    for kind in recorded])
for w in [DT1, DT2, DT4]: # Weak classifier w.fit(X_train, y_train) err = 1.0 - w.score(X_train, y_train) ax.plot([1, n_estimators], [err] * 2, styles[j], label="Decision Tree, max depth %d (DT%d)" % (depths[j], depths[j])) # AdaBoost classifier ada = AdaBoostClassifier(base_estimator=w, n_estimators=n_estimators, random_state=0) ada_train_err = np.zeros((n_estimators, )) ada.fit(X_train, y_train) for i, y_pred in enumerate(ada.staged_predict(X_train)): ada_train_err[i] = zero_one_loss(y_pred, y_train) smoothed = [] # use moving average filter to smooth plots -- done to make easier # to see trends; you are encouraged to also plot 'ada_train_err' to # see the actual error plots!! for i in range(len(ada_train_err)): temp = 0. counter = 0. for k in range(i - 5, i + 1): if k >= 0: temp += ada_train_err[k] counter += 1. smoothed.append(temp / counter)
label='Class %s' % n, alpha=.5, edgecolor='k') x1, x2, y1, y2 = plt.axis() plt.axis((x1, x2, y1, y2 * 1.2)) plt.legend(loc='upper right') plt.ylabel('Samples') plt.xlabel('Score') plt.title('Decision Scores') plt.tight_layout() plt.subplots_adjust(wspace=0.35) plt.show() ada_discrete_err_train = np.zeros((2000, )) for i, y_pred in enumerate(bdt.staged_predict(X)): ada_discrete_err_train[i] = zero_one_loss(y, y_pred) ada_discrete_err_train_hinge = np.zeros((2000, )) for i, y_pred in enumerate(bdt.staged_predict(X)): ada_discrete_err_train_hinge[i] = mean_squared_error(y, y_pred) fig = plt.figure(2) ax = fig.add_subplot(111) ax.plot(np.arange(2000) + 1, ada_discrete_err_train_hinge, label='AdaBoost Train Error MSE', color='red') ax.plot(np.arange(2000) + 1, ada_discrete_err_train,
# Fit a SAMME AdaBoost ensemble on the PCA-reduced training set, persist the
# model, and write "<row index>,<predicted label>" rows for the test set.
X = np.genfromtxt("Data/train_mice2000PCA.csv", delimiter=",")
print(" read X ")
Y = np.genfromtxt("Data/train_Y.csv", delimiter=",", dtype='int32')
test = np.genfromtxt("Data/test_mice2000PCA.csv", delimiter=",")
test_Y = np.zeros((test.shape[0], 1))

bdt_discrete = AdaBoostClassifier(DecisionTreeClassifier(max_depth=7),
                                  n_estimators=20,
                                  learning_rate=1.5,
                                  algorithm="SAMME")
print("fitting ")
bdt_discrete.fit(X, Y)
print("score :", bdt_discrete.score(X, Y))

filename = 'adaboost_model.sav'
# Use a context manager so the file handle is flushed and closed even if
# pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(bdt_discrete, model_file)

# predict() returns the final ensemble prediction as an array.  The staged
# variant returns a generator over all boosting stages, which has no
# reshape/astype and therefore cannot be used here.
pred = bdt_discrete.predict(test)
pred = pred.reshape((-1, 1))
pred = pred.astype(np.int64)

# Row indices as a column vector, so they can be stacked next to pred.
idx = np.arange(test.shape[0]).reshape((-1, 1))
idx = idx.astype(np.int64)
output = np.concatenate((idx, pred), axis=1)
np.savetxt("results/adaboost_2000ftrs.csv", output.astype(int), fmt='%i', delimiter=",")
# Fit a SAMME ensemble of unpruned trees on the first n_split rows and report
# per-stage accuracy on the remaining rows, together with the per-estimator
# errors and weights.
n_split = 109296
X_train, y_train = X[:n_split], y[:n_split]
X_test, y_test = X[n_split:], y[n_split:]

bdt_discrete = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=None),
    n_estimators=50,
    learning_rate=1,
    algorithm="SAMME")
bdt_discrete.fit(X_train, y_train)

discrete_test_errors = []
discrete_test_accuracy = []
# Despite its name, discrete_train_predict holds the prediction on X_test
# after each boosting stage.
for discrete_train_predict in bdt_discrete.staged_predict(X_test):
    stage_accuracy = accuracy_score(discrete_train_predict, y_test)
    discrete_test_accuracy.append(stage_accuracy)
    discrete_test_errors.append(1. - stage_accuracy)

# Crop the per-estimator arrays to the number of trees actually fitted.
n_trees_discrete = len(bdt_discrete)
discrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete]
discrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete]

for caption, values in (
        ("The estimator error of 50 stages are : ", discrete_estimator_errors),
        ("The estimator weights of 50 stages are : ", discrete_estimator_weights),
        ("The accuracy scores of 50 stages are : ", discrete_test_accuracy)):
    print(caption, values)
def run_main():
    """Fit an AdaBoost model on tournament games and dissect one missed game.

    Loads the summary, team, tournament and computer-ranking data, builds the
    feature table, fits an AdaBoost ensemble of decision stumps, prints the
    games the model got wrong on the test split, and finally walks through
    every stump's vote for the game with index 246.
    """
    tournament_season = 2003

    # --- load and join the raw data ------------------------------------
    summary_data = read_summary_team_data()
    teams = read_team_meta_data()
    tourney_data = read_tournament_results(tournament_season)
    game_data = utils.compute_game_data(tourney_data, teams)

    computer_rankings = pd.read_csv(Path('../data/massey_seasons_with_id.csv'))
    in_range = computer_rankings['season'] >= tournament_season
    computer_rankings = computer_rankings[in_range]

    tourney_data = recode_tourney_data(tourney_data)
    tourney_data = merge_tourney_summary_data(tourney_data, summary_data)
    tourney_data = join_tourney_team_data(tourney_data, teams)

    tourney_comp_ratings = merge_tourney_ranking_data(tourney_data,
                                                      computer_rankings)
    tourney_comp_ratings = utils.implement_top_conference_feature(
        tourney_data, teams, game_data, tourney_comp_ratings)
    tourney_comp_ratings = utils.implement_seed_threshold_feature(
        tourney_comp_ratings)
    tourney_comp_ratings = compute_delta_features(tourney_comp_ratings)

    # --- build the feature table ---------------------------------------
    identifier_columns = [
        'round', 'game_date', 'seed_t', 'team_t', 'team_id_t', 'team_id_o',
        'team_o', 'seed_o', 'team_id_o', 'game_result', 'start_season',
        'game result', 'conf_name_t', 'conf_name_o'
    ]
    raw_stat_columns = [
        'pts_avg_t', 'pts_avg_o', 'opp_pts_avg_t', 'opp_pts_avg_o',
        'margin_victory_avg_t', 'margin_victory_avg_o', 'poss_avg_t',
        'poss_avg_o', 'fg_pct_t', 'fg_pct_o', 'off_rebs_avg_t',
        'off_rebs_avg_o', 'def_rebs_avg_t', 'def_rebs_avg_o', 'ft_pct_t',
        'ft_pct_o', 'to_avg_t', 'to_avg_o', 'steal_avg_t', 'steal_avg_o',
        'to_net_avg_t', 'to_net_avg_o', 'win_pct_t', 'win_pct_o',
        'off_rating_t', 'off_rating_o', 'ft_att_avg_t', 'ft_att_avg_o',
        'opp_pts_avg_t', 'opp_pts_avg_o', 'srs_t', 'srs_o', 'sos_t', 'sos_o',
        'sag_t', 'sag_o', 'wlk_t', 'wlk_o', 'wol_t', 'wol_o', 'rth_t',
        'rth_o', 'col_t', 'col_o', 'pom_t', 'pom_o', 'dol_t', 'dol_o',
        'rpi_t', 'rpi_o', 'mor_t', 'mor_o'
    ]
    feature_data = tourney_comp_ratings.drop(columns=identifier_columns).copy()
    feature_data = feature_data.drop(columns=raw_stat_columns)
    # for now drop the delta seed features
    feature_data = feature_data.drop(columns=['upset_seed_threshold'])

    X = feature_data[feature_data['season_t'] >= tournament_season]
    y = tourney_comp_ratings[
        tourney_comp_ratings['season_t'] >= tournament_season]['game_result']
    X = X.drop(columns=['season_t'])
    feature_list = list(X)

    # --- fit and score the model ---------------------------------------
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=5)
    number_estimators = 501
    bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             algorithm="SAMME.R",
                             n_estimators=number_estimators,
                             learning_rate=1)
    bdt.fit(X_train, y_train)
    score = bdt.score(X_train, y_train)
    print("Training Model Score= ", score)

    y_pred = bdt.predict(X_test)
    print("AdaBoost model accuracy is %2.2f" %
          metrics.accuracy_score(y_test, y_pred))

    # --- collect the games the model got wrong -------------------------
    prediction_probabilities = bdt.predict_proba(X_test)
    win_probabilities = pd.Series(prediction_probabilities[:, 1],
                                  index=X_test.index)
    predictions = pd.Series(y_pred, index=y_test.index)

    is_test_game = tourney_comp_ratings.index.isin(X_test.index)
    test_games = tourney_comp_ratings[is_test_game].copy()
    test_games['predicted_result'] = predictions
    test_games['pred_win_prob'] = win_probabilities

    was_missed = test_games['game_result'] != test_games['predicted_result']
    missed_predictions = test_games[was_missed].sort_values(
        by='pred_win_prob', ascending=False)
    print("Missed predictions= ", missed_predictions.shape[0])

    feature_dictionary = utils.Feature_Dictionary()
    # Called for its printing side effect only; the result is discarded.
    missed_predictions.apply(
        lambda x: feature_dictionary.print_game_info(
            test_games, x['season_t'], x['round'], x['team_t']),
        axis=1)

    supporting_features = missed_predictions.apply(
        lambda row: utils.get_supporting_features(row, feature_dictionary,
                                                  feature_list),
        axis=1)
    missed_predictions = missed_predictions.merge(
        supporting_features.to_frame(name='supporting_features'),
        how='left', left_index=True, right_index=True)

    # Percentage of the features listed as supporting each missed game.
    supporting_counts = missed_predictions['supporting_features'].apply(
        lambda x: len(x))
    missed_predictions['features'] = 100 * supporting_counts / len(feature_list)
    missed_predictions['game_index'] = missed_predictions.index

    plot_missed_predictions_df = pd.melt(
        missed_predictions[['game_index', 'features']],
        id_vars='game_index',
        var_name='Features Supporting Outcome')

    # plot_missed_predictions_df.head()
    # m_plot = sns.barplot(x='game_index', y='value', hue='Features Supporting Outcome',
    #                      data=plot_missed_predictions_df)
    # plt.title("Percentage Of Features Consistent With Incorrectly Predicted Game Outcomes")
    # plt.ylabel('Percentage')
    # plt.xlabel('Game Index')
    # m_plot.figure.set_size_inches(20, 6)

    print("Missed Predictions with greater than 50% feature support")
    print(plot_missed_predictions_df[plot_missed_predictions_df['value'] > 50])

    # --- analyze game index 246 ----------------------------------------
    game_index = 246
    print(missed_predictions.loc[game_index])
    missed_game = X.loc[game_index].to_frame().T

    # One staged prediction is consumed per stump in the loop below.
    staged_predictions = bdt.staged_predict(missed_game)
    class_team_votes = 0
    class_opp_votes = 0
    label_dict = {}
    estimator_stubs = []

    for stump in bdt.estimators_:
        tree = stump.tree_
        # Node 0 is the root of the stump: its feature and threshold define
        # the single test the stump performs.
        feature_position = tree.feature[0]
        stub_feature = missed_game.columns[feature_position]
        label_dict[stub_feature] = label_dict.get(stub_feature, 0) + 1

        threshold_value = tree.threshold[0]
        test_value = missed_game.iloc[0, feature_position]

        left_child_node = tree.children_left[0]
        right_child_node = tree.children_right[0]
        left_values = tree.value[left_child_node][0]
        right_values = tree.value[right_child_node][0]
        node_samples = tree.n_node_samples

        test_string = "Test: {0} ({1:6.3f}) <= {2:6.3f}".format(
            stub_feature, test_value, threshold_value)
        left_string = "Left: Samples= {0}, Values= {1:5.3f}, {2:5.3f}".format(
            node_samples[left_child_node], left_values[0], left_values[1])
        right_string = "Right: Samples= {0}, Values= {1:5.3f}, {2:5.3f}".format(
            node_samples[right_child_node], right_values[0], right_values[1])

        # Follow the branch this game falls into, then let that leaf vote.
        if test_value <= threshold_value:
            side, leaf_values = "Left", left_values
        else:
            side, leaf_values = "Right", right_values

        if leaf_values[0] >= leaf_values[1]:
            result_string = (
                "Result: {0} --> Choose Class -1 --> Opp Team Wins".format(side))
            class_opp_votes += 1
        else:
            result_string = (
                "Result: {0} --> Choose Class 1 --> Team Wins".format(side))
            class_team_votes += 1

        staged_prediction_string = (
            'Stage Prediction= Class 1'
            if next(staged_predictions) == 1
            else 'Stage Prediction= Class -1')

        estimator_stubs.append({
            'test_string': test_string,
            'left_string': left_string,
            'right_string': right_string,
            'result_string': result_string,
            'staged_prediction_string': staged_prediction_string
        })

    report_keys = ('test_string', 'left_string', 'right_string',
                   'result_string', 'staged_prediction_string')
    for item_count, item in enumerate(estimator_stubs):
        print("Estimator: ", item_count)
        for key in report_keys:
            print(item[key])
        print("-----------")

    print("Class Team Votes= ", class_team_votes)
    print("Class Opp Votes= ", class_opp_votes)
    print()
    print("Number of features in tree stumps= ", len(label_dict))
    for key, value in label_dict.items():
        print('Feature: ', key, ' Count= ', value)
# AdaBoost (default tree base estimator) on the features without PCA:
# track the macro-F1 after every boosting stage on the validation and
# training sets.
#import warnings
#warnings.filterwarnings("ignore")
from collections import Counter

from imblearn.over_sampling import SMOTE # doctest: +NORMALIZE_WHITESPACE
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, f1_score

clf_no_pca = AdaBoostClassifier(n_estimators=1000, random_state=1)
clf_no_pca.fit(model_tr1, y_train)

# real_val_macro1 starts with a 0 so the running maximum is defined for the
# first stage; that leading entry is skipped when plotting.
real_val_macro1 = [0]
train_macro1 = []

for real_test_predict in clf_no_pca.staged_predict(model_val1):
    stage_macro = f1_score(y_val, real_test_predict, average='macro')
    # Keep the validation prediction of the best stage seen so far.
    if stage_macro > np.max(real_val_macro1):
        pred_opt1 = real_test_predict
    real_val_macro1.append(stage_macro)

for real_train_predict in clf_no_pca.staged_predict(model_tr1):
    stage_macro_train = f1_score(y_train, real_train_predict, average='macro')
    train_macro1.append(stage_macro_train)

# In[22]:
plt.figure()
plt.plot(range(1, n_trees_real + 1), real_val_macro1[1:], label='val')
# Fit AdaBoost on a 20% training split and plot how the training and test
# misclassification rates evolve over the boosting iterations.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=SEED)

# clf = AdaBoostM1(n_estimators=100)
clf = AdaBoostClassifier(n_estimators=400, learning_rate=1)
clf.fit(X_train, y_train)

# Error rates of the fully boosted model.
y_pred = clf.predict(X_train)
print(f"Final training error rate:{error_rate(y_train, y_pred)}")
y_pred = clf.predict(X_test)
print(f"Final test error rate:{error_rate(y_test, y_pred)}")

# Error rate after each boosting stage.
training_errors = [error_rate(y_train, stage_pred)
                   for stage_pred in clf.staged_predict(X_train)]
test_errors = [error_rate(y_test, stage_pred)
               for stage_pred in clf.staged_predict(X_test)]

fig1, ax = plt.subplots()
for errors, colour, caption in (
        (training_errors, "tab:blue", "Training error"),
        (test_errors, "tab:red", "Test error")):
    ax.plot(errors, c=colour, label=caption)
plt.legend()
plt.ylabel("Misclassification error rate")
plt.xlabel("Boosting iteration")
plt.savefig("test_error.svg")
plt.close()
from classifierWithFS import preLoadData, getBestMeanFeatures, selectFeatures X_train, y_train_phq8, X_test, y_dev, y_train, y_test = preLoadData() maxFeature, featureChoices = getBestMeanFeatures() X_train, X_test, chosenFeatures, numOfFeatures = selectFeatures( maxFeature, featureChoices, X_train, X_test) bdt_real = AdaBoostClassifier(random_state=13370) bdt_real.fit(X_train, y_train) real_test_errors = [] for real_test_predict in bdt_real.staged_predict(X_test): real_test_errors.append(1. - accuracy_score(real_test_predict, y_test)) n_trees_real = len(bdt_real) # Boosting might terminate early, but the following arrays are always # n_estimators long. We crop them to the actual number of trees here: real_estimator_errors = bdt_real.estimator_errors_[:n_trees_real] real_estimator_weights = bdt_real.estimator_weights_[:n_trees_real] #plt.figure(figsize=(15, 5)) #plt.subplot(131) plt.plot(range(1, n_trees_real + 1), real_test_errors, c='black',
y_train = np.append(os, zs) print "training" base_ada.fit(X=X_train, y=y_train) os = np.ones(len(bkgtest)) zs = np.zeros(len(sigtest)) print "adding samples together" X_test = pandas.concat([sigtest, bkgtest]) y_test = np.append(os, zs) sigoutput = base_ada.decision_function(X=sigtest) bkgoutput = base_ada.decision_function(X=bkgtest) from sklearn.metrics import accuracy_score test_errors = [] for te in base_ada.staged_predict(X_test): test_errors.append(1.- accuracy_score(te, y_test)) ntrees = len(test_errors) estimator_errors = base_ada.estimator_errors_[:ntrees] estimator_weights = base_ada.estimator_weights_[:ntrees] from matplotlib.ticker import LinearLocator with PdfPages("bdtplots.pdf") as pdf: xs, xe, ys, ye = get_hist(bkgoutput) plt.errorbar(xs, ys, xerr=xe, yerr=ye, color='red', fmt='.', label='bkg') xs, xe, ys, ye = get_hist(sigoutput) plt.errorbar(xs, ys, xerr=xe, yerr=ye, color='blue', fmt='.',