Example #1
def test_staged_predict():
    """Check staged predictions."""
    # AdaBoost classification
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg, n_estimators=10)
        clf.fit(iris.data, iris.target)

        predictions = clf.predict(iris.data)
        staged_predictions = [p for p in clf.staged_predict(iris.data)]
        proba = clf.predict_proba(iris.data)
        staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
        score = clf.score(iris.data, iris.target)
        staged_scores = [s for s in clf.staged_score(iris.data, iris.target)]

        assert_equal(len(staged_predictions), 10)
        assert_array_almost_equal(predictions, staged_predictions[-1])
        assert_equal(len(staged_probas), 10)
        assert_array_almost_equal(proba, staged_probas[-1])
        assert_equal(len(staged_scores), 10)
        assert_array_almost_equal(score, staged_scores[-1])

    # AdaBoost regression
    clf = AdaBoostRegressor(n_estimators=10, random_state=0)
    clf.fit(boston.data, boston.target)

    predictions = clf.predict(boston.data)
    staged_predictions = [p for p in clf.staged_predict(boston.data)]
    score = clf.score(boston.data, boston.target)
    staged_scores = [s for s in clf.staged_score(boston.data, boston.target)]

    assert_equal(len(staged_predictions), 10)
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert_equal(len(staged_scores), 10)
    assert_array_almost_equal(score, staged_scores[-1])
Example #2
def initialize(n_iterations, tree_depth=2, sample_size=300, sample_noise=0.2):

    # create dataset
    X, y = make_moons(n_samples=sample_size, noise=sample_noise)

    # fit classifier
    adaboost = AdaBoostClassifier(
        n_estimators=n_iterations,
        base_estimator=DecisionTreeClassifier(max_depth=tree_depth))
    adaboost.fit(X, y)

    # get sample weights
    staged_classification = np.array(list(adaboost.staged_predict(X)))
    staged_misclassified = staged_classification != y
    staged_sample_weights = np.ones(shape=(n_iterations + 1, len(X))) / len(X)
    for istage in range(1, n_iterations + 1):
        estimator_weight = adaboost.estimator_weights_[istage - 1]
        sample_weight = staged_sample_weights[istage - 1].copy()
        incorrect = staged_misclassified[istage - 1]
        ############ code snippets from sklearn AdaboostClassifier source ############
        # Only boost positive weights
        sample_weight *= np.exp(estimator_weight * incorrect *
                                ((sample_weight > 0) | (estimator_weight < 0)))
        ##############################################################################
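        # The masked update above follows the SAMME re-weighting rule: each
        # misclassified sample's weight is scaled by exp(estimator_weight),
        # while correctly classified samples keep their weight before the
        # renormalization below.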
        sample_weight /= np.sum(sample_weight)
        staged_sample_weights[istage] = sample_weight

    # prepare to plot decision boundary
    h = .1
    xrange = np.max(X[:, 0]) - np.min(X[:, 0])
    yrange = np.max(X[:, 1]) - np.min(X[:, 1])
    xs = np.arange(
        np.min(X[:, 0]) - xrange * 0.1,
        np.max(X[:, 0]) + xrange * 0.1, h)
    ys = np.arange(
        np.min(X[:, 1]) - yrange * 0.1,
        np.max(X[:, 1]) + yrange * 0.1, h)
    xx, yy = np.meshgrid(xs, ys)
    staged_zz = np.array(
        list(adaboost.staged_predict(np.c_[xx.ravel(), yy.ravel()])))
    staged_zz = staged_zz.reshape(len(staged_zz), xx.shape[0], xx.shape[1])
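    # staged_predict yields the full ensemble's prediction after each boosting
    # round, so staged_zz[k] is the decision surface of the first k+1
    # estimators evaluated on the mesh grid.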

    # get estimators in the ensemble
    estimators = adaboost.estimators_
    single_zz = np.zeros(shape=(len(estimators), xx.shape[0], xx.shape[1]))
    for iiter in range(1, n_iterations):
        next_estimator = estimators[iiter]
        next_zz = next_estimator.predict(np.c_[xx.ravel(), yy.ravel()])
        next_zz = next_zz.reshape(xx.shape)
        single_zz[iiter] = next_zz

    globalvars = {}
    globalvars['X'] = X
    globalvars['y'] = y
    globalvars['staged_sample_weights'] = staged_sample_weights
    globalvars['xs'] = xs
    globalvars['ys'] = ys
    globalvars['staged_zz'] = staged_zz
    globalvars['single_zz'] = single_zz
    return globalvars
Example #3
def test_staged_predict():
    """Check staged predictions."""
    # AdaBoost classification
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg, n_estimators=10)
        clf.fit(iris.data, iris.target)

        predictions = clf.predict(iris.data)
        staged_predictions = [p for p in clf.staged_predict(iris.data)]
        proba = clf.predict_proba(iris.data)
        staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
        score = clf.score(iris.data, iris.target)
        staged_scores = [s for s in clf.staged_score(iris.data, iris.target)]

        assert_equal(len(staged_predictions), 10)
        assert_array_almost_equal(predictions, staged_predictions[-1])
        assert_equal(len(staged_probas), 10)
        assert_array_almost_equal(proba, staged_probas[-1])
        assert_equal(len(staged_scores), 10)
        assert_array_almost_equal(score, staged_scores[-1])

    # AdaBoost regression
    clf = AdaBoostRegressor(n_estimators=10)
    clf.fit(boston.data, boston.target)

    predictions = clf.predict(boston.data)
    staged_predictions = [p for p in clf.staged_predict(boston.data)]
    score = clf.score(boston.data, boston.target)
    staged_scores = [s for s in clf.staged_score(boston.data, boston.target)]

    assert_equal(len(staged_predictions), 10)
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert_equal(len(staged_scores), 10)
    assert_array_almost_equal(score, staged_scores[-1])
Example #4
def some(X, Y, X_test, Y_test):
    ada = AdaBoostClassifier()
    print("Train Model ---")
    t1 = time()
    ada.fit(X, Y)
    t2 = time()
    print("Model Trained ----------", t2 - t1)
    test_errors = []
    cur = 1
    Y_test2 = []
    for k in Y_test:
        Y_test2.append(k[0])
    print("Testing: ")
    print(Y_test2)
    pred = ada.predict(X_test)
    print(pred)
    accu = 1. - accuracy_score(y_true=Y_test2, y_pred=pred)
    print(accu)
    print("STAGED _____________")
    for test_predict in ada.staged_predict(X_test):
        test_errors.append(1. - accuracy_score(test_predict, Y_test2))

    print("errors: ")
    print(test_errors)
Example #5
def test_staged_predict(algorithm):
    # Check staged predictions.
    rng = np.random.RandomState(0)
    iris_weights = rng.randint(10, size=iris.target.shape)
    diabetes_weights = rng.randint(10, size=diabetes.target.shape)

    clf = AdaBoostClassifier(algorithm=algorithm, n_estimators=10)
    clf.fit(iris.data, iris.target, sample_weight=iris_weights)

    predictions = clf.predict(iris.data)
    staged_predictions = [p for p in clf.staged_predict(iris.data)]
    proba = clf.predict_proba(iris.data)
    staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
    score = clf.score(iris.data, iris.target, sample_weight=iris_weights)
    staged_scores = [
        s for s in clf.staged_score(
            iris.data, iris.target, sample_weight=iris_weights)
    ]

    assert len(staged_predictions) == 10
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert len(staged_probas) == 10
    assert_array_almost_equal(proba, staged_probas[-1])
    assert len(staged_scores) == 10
    assert_array_almost_equal(score, staged_scores[-1])

    # AdaBoost regression
    clf = AdaBoostRegressor(n_estimators=10, random_state=0)
    clf.fit(diabetes.data, diabetes.target, sample_weight=diabetes_weights)

    predictions = clf.predict(diabetes.data)
    staged_predictions = [p for p in clf.staged_predict(diabetes.data)]
    score = clf.score(diabetes.data,
                      diabetes.target,
                      sample_weight=diabetes_weights)
    staged_scores = [
        s for s in clf.staged_score(
            diabetes.data, diabetes.target, sample_weight=diabetes_weights)
    ]

    assert len(staged_predictions) == 10
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert len(staged_scores) == 10
    assert_array_almost_equal(score, staged_scores[-1])
Example #6
def test_staged_predict():
    # Check staged predictions.
    rng = np.random.RandomState(0)
    iris_weights = rng.randint(10, size=iris.target.shape)
    boston_weights = rng.randint(10, size=boston.target.shape)

    # AdaBoost classification
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg, n_estimators=10)
        clf.fit(iris.data, iris.target, sample_weight=iris_weights)

        predictions = clf.predict(iris.data)
        staged_predictions = [p for p in clf.staged_predict(iris.data)]
        proba = clf.predict_proba(iris.data)
        staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
        score = clf.score(iris.data, iris.target, sample_weight=iris_weights)
        staged_scores = [
            s for s in clf.staged_score(
                iris.data, iris.target, sample_weight=iris_weights)
        ]

        assert_equal(len(staged_predictions), 10)
        assert_array_almost_equal(predictions, staged_predictions[-1])
        assert_equal(len(staged_probas), 10)
        assert_array_almost_equal(proba, staged_probas[-1])
        assert_equal(len(staged_scores), 10)
        assert_array_almost_equal(score, staged_scores[-1])

    # AdaBoost regression
    clf = AdaBoostRegressor(n_estimators=10, random_state=0)
    clf.fit(boston.data, boston.target, sample_weight=boston_weights)

    predictions = clf.predict(boston.data)
    staged_predictions = [p for p in clf.staged_predict(boston.data)]
    score = clf.score(boston.data, boston.target, sample_weight=boston_weights)
    staged_scores = [
        s for s in clf.staged_score(
            boston.data, boston.target, sample_weight=boston_weights)
    ]

    assert_equal(len(staged_predictions), 10)
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert_equal(len(staged_scores), 10)
    assert_array_almost_equal(score, staged_scores[-1])
Example #7
def test_staged_predict():
    # Check staged predictions.
    rng = np.random.RandomState(0)
    iris_weights = rng.randint(10, size=iris.target.shape)
    boston_weights = rng.randint(10, size=boston.target.shape)

    # AdaBoost classification
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg, n_estimators=10)
        clf.fit(iris.data, iris.target, sample_weight=iris_weights)

        predictions = clf.predict(iris.data)
        staged_predictions = [p for p in clf.staged_predict(iris.data)]
        proba = clf.predict_proba(iris.data)
        staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
        score = clf.score(iris.data, iris.target, sample_weight=iris_weights)
        staged_scores = [
            s for s in clf.staged_score(
                iris.data, iris.target, sample_weight=iris_weights)]

        assert_equal(len(staged_predictions), 10)
        assert_array_almost_equal(predictions, staged_predictions[-1])
        assert_equal(len(staged_probas), 10)
        assert_array_almost_equal(proba, staged_probas[-1])
        assert_equal(len(staged_scores), 10)
        assert_array_almost_equal(score, staged_scores[-1])

    # AdaBoost regression
    clf = AdaBoostRegressor(n_estimators=10, random_state=0)
    clf.fit(boston.data, boston.target, sample_weight=boston_weights)

    predictions = clf.predict(boston.data)
    staged_predictions = [p for p in clf.staged_predict(boston.data)]
    score = clf.score(boston.data, boston.target, sample_weight=boston_weights)
    staged_scores = [
        s for s in clf.staged_score(
            boston.data, boston.target, sample_weight=boston_weights)]

    assert_equal(len(staged_predictions), 10)
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert_equal(len(staged_scores), 10)
    assert_array_almost_equal(score, staged_scores[-1])
Example #8
    def adaBoostKNN(self, n_estimators, n_neighbors):
        # AdaBoost with a KNN base estimator
        X_train = self.X_train
        y_train = self.y_train
        X_test = self.X_test
        y_test = self.y_test
        real_test_errors = []
        clf = KNeighborsClassifier(n_neighbors=n_neighbors)
        clf.sample_weight = np.ones(len(y_train))/len(y_train)
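        # Note: assigning a `sample_weight` attribute on the estimator has no
        # effect on fitting; sklearn passes sample weights through fit(). Since
        # KNeighborsClassifier.fit does not accept sample_weight, AdaBoost may
        # reject it as a base estimator.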
        bdt_real = AdaBoostClassifier(clf, n_estimators=n_estimators, learning_rate=1)

        bdt_real.fit(X_train, y_train)
        min_error = 1.0
        count = 0
        for real_test_predict in bdt_real.staged_predict(X_test):
            error = 1.0 - accuracy_score(real_test_predict, y_test)
            if error < min_error:
                min_error = error
                min_error_pred = real_test_predict
                min_error_n = count
            real_test_errors.append(error)
            count += 1
        n_trees_real = len(bdt_real)
        real_estimator_errors = bdt_real.estimator_errors_[:n_trees_real]

        plt.figure(figsize=(15, 5))

        plt.subplot(1, 2, 1)
        plt.plot(range(1, n_trees_real + 1),
                 real_test_errors, c='black',
                 linestyle='dashed', label='SAMME.R')
        plt.legend()
        # plt.ylim(0.18, 0.62)
        plt.ylabel('Test Error')
        plt.xlabel('Number of Trees')

        plt.subplot(1, 2, 2)
        plt.plot(range(1, n_trees_real + 1), real_estimator_errors,
                 "r", label='SAMME.R', alpha=.5)
        plt.legend()
        plt.ylabel('Error')
        plt.xlabel('Number of Trees')
        plt.ylim((.2, real_estimator_errors.max() * 1.2))
        plt.xlim((-20, len(bdt_real) + 20))

        # prevent overlapping y-axis labels
        plt.subplots_adjust(wspace=0.25)
        plt.show()

        return min_error_pred, min_error_n
Example #9
def solution(filename):
    # Read the data
    data = pd.read_csv(filename + '.csv')
    # data['class'] = pd.factorize(data['class'])[0]
    y = data['class'].values
    x = data.drop(['class'], axis=1).values

    # Split the data into training and test sets
    # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15)

    # Boosting algorithm
    if filename == 'chips':
        learning_rate = 0.15
    else:
        learning_rate = 1.15
    model = AdaBoostClassifier(learning_rate=learning_rate, n_estimators=150)
    model.fit(x, y)

    # Plot the decision boundary for each step of the algorithm
    plt.figure()
    plt.scatter(x[:, 0], x[:, 1], c=colors(y), cmap=plt.cm.Paired)
    ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    xx = np.linspace(xlim[0], xlim[1], 30)
    yy = np.linspace(ylim[0], ylim[1], 30)
    YY, XX = np.meshgrid(yy, xx)
    xy = np.vstack([XX.ravel(), YY.ravel()]).T

    y_pred = model.staged_predict(x)
    decision_funcs = model.staged_decision_function(xy)

    i = 1
    for _, func in zip(y_pred, decision_funcs):
        plt.clf()
        plt.scatter(x[:, 0], x[:, 1], c=colors(y), cmap=plt.cm.Paired)
        f = func.reshape(XX.shape)
        ax = plt.gca()
        ax.contour(XX,
                   YY,
                   f,
                   colors='k',
                   levels=[-1, 0, 1],
                   alpha=0.5,
                   linestyles=['--', '-', '--'])
        plt.legend([f'Step number {i}'])
        plt.savefig(filename + '/' + str(i) + '.png')

        i += 1
Example #10
def show():
    # set the number of AdaBoost iterations
    n_estimators = 200
    # generate the dataset
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    # of the 12000 samples, take the first 2000 rows as the test set and the rest as the training set
    train_x, train_y = X[2000:], y[2000:]
    test_x, test_y = X[:2000], y[:2000]
    # weak classifier
    dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
    dt_stump.fit(train_x, train_y)
    dt_stump_err = 1.0 - dt_stump.score(test_x, test_y)
    # decision tree classifier
    dt = DecisionTreeClassifier()
    dt.fit(train_x, train_y)
    dt_err = 1.0 - dt.score(test_x, test_y)
    # AdaBoost classifier
    ada = AdaBoostClassifier(base_estimator=dt_stump,
                             n_estimators=n_estimators)
    ada.fit(train_x, train_y)
    # visualize the error rates of the three classifiers
    fig = plt.figure()
    # configure the matplotlib font (SimHei)
    plt.rcParams['font.sans-serif'] = ['SimHei']
    ax = fig.add_subplot(111)
    ax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-', label=u'Decision stump (weak classifier) error rate')
    ax.plot([1, n_estimators], [dt_err] * 2, 'k--', label=u'Decision tree error rate')
    ada_err = np.zeros((n_estimators, ))
    # iterate over the staged results: i is the iteration index, pred_y the staged prediction
    for i, pred_y in enumerate(ada.staged_predict(test_x)):
        # record the error rate
        ada_err[i] = zero_one_loss(pred_y, test_y)
    # plot the AdaBoost error rate at each iteration
    ax.plot(np.arange(n_estimators) + 1,
            ada_err,
            label='AdaBoost test error rate',
            color='orange')
    ax.set_xlabel('Number of iterations')
    ax.set_ylabel('Error rate')
    leg = ax.legend(loc='upper right', fancybox=True)
    plt.show()
Example #11
def ensembleProc(n_estimators, learning_rate, trainfile, testfile):
	features = np.genfromtxt(trainfile, delimiter=' ', usecols=(0, 1, 2))
	labels = np.genfromtxt(trainfile, delimiter=' ', usecols=(-1))
	tests = np.genfromtxt(testfile, delimiter=' ', usecols=(0, 1, 2))
	testlabels = np.genfromtxt(testfile, delimiter=' ', usecols=(-1))

	dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
	dt_stump.fit(features, labels)

	ada_real = AdaBoostClassifier(
		base_estimator=dt_stump,
		learning_rate=learning_rate,
		n_estimators=n_estimators,
		algorithm="SAMME")
	ada_real.fit(features, labels)

	error = np.zeros((n_estimators,))
	for i, predict in enumerate(ada_real.staged_predict(tests)):
		error[i] = zero_one_loss(predict, testlabels)
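	# np.mean over the staged errors reports the average test error across
	# all boosting stages, not the error of the final ensemble.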
	
	return np.mean(error)
Example #12
def adaboost():
    X_train, y_train = read('train')
    X_test, y_test = read('test')
    X_train = X_train.reshape((X_train.shape[0], -1))
    X_test = X_test.reshape((X_test.shape[0], -1))

    bdt_discrete = AdaBoostClassifier(DecisionTreeClassifier(
        max_depth=10, min_samples_split=20, min_samples_leaf=5),
                                      n_estimators=500,
                                      learning_rate=0.5,
                                      algorithm='SAMME')

    bdt_discrete.fit(X_train, y_train)

    discrete_test_errors = []

    for discrete_train_predict in bdt_discrete.staged_predict(X_test):
        discrete_test_errors.append(
            1. - accuracy_score(discrete_train_predict, y_test))

    return bdt_discrete, discrete_test_errors
print("adaboost classifier training in %.2f" % (time() - start))

# use cross-validation to estimate accuracy
# start = time()
# train_pred = cross_val_predict(ada_clf, bag_of_words, train.cuisine, cv=2)
# print("adaboost evaluation finished in %.2f" % (time() - start))

# print("Estimated accuracy using cross-validation: " , accuracy_score(train.cuisine, train_pred))


# use rest of labelled training data to check accuracy score (for plotting)
test = pd.read_json("data/train2.json")
test_words = [" ".join(item) for item in test.ingredients]
test_bag = vec.transform(test_words).toarray()
test_errors = []
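# staged_predict yields one prediction array per boosting stage, so the loop
# below traces the test error as estimators are added one at a time.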
for test_predict in ada_clf.staged_predict(test_bag):
    test_errors.append(1.0 - accuracy_score(test_predict, test.cuisine))

plt.figure(figsize=(15, 5))

plt.plot(range(1, len(ada_clf) + 1), test_errors)
plt.ylabel("Test Error")
plt.xlabel("Number of Trees")
plt.show()

# Load in Testing Data
test = pd.read_json("data/test.json")

# Create test Bag of Words
test_words = [" ".join(item) for item in test.ingredients]
test_bag = vec.transform(test_words).toarray()
Example #14
    # Now predict the value of the digit on the second half:
    predicted = classifier.predict(X_test)
    r_predicted = r_classifier.predict(X_test)

    print("Classification report for classifier %s:\n%s\n"
          % (classifier, metrics.classification_report(y_test, predicted)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, predicted))

    print("Classification report for classifier %s:\n%s\n"
          % (r_classifier, metrics.classification_report(y_test, r_predicted)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, r_predicted))

    n_trees = range(1, len(classifier) + 1)
    test_errors = []
    train_errors = []
    for p in classifier.staged_predict(X_test):
        test_errors.append(1. - accuracy_score(p, y_test))
    for p in classifier.staged_predict(X_train):
        train_errors.append(1. - accuracy_score(p, y_train))

    test_errors_rand = []
    for i in range(1, args.estimators + 1):
        print('.', end='')
        r_classifier = RandomForestClassifier(n_estimators=i, n_jobs=args.jobs, max_depth=args.max_depth)
        r_classifier.fit(X_train, y_train)
        r_predicted = r_classifier.predict(X_test)
        test_errors_rand.append(1. - accuracy_score(r_predicted, y_test))
    print('.')

    pl.subplot(1,1,1)
    pl.plot(n_trees, test_errors, c='red', label='AdaBoost.%s' % args.boost)
Example #15
            test_word_arrayLabel.append(label[randomIndex])
            test_word_array.append(train_mood_array[randomIndex])
            del (train_mood_array[randomIndex])
            del (label[randomIndex])
        except Exception as e:
            print(e)
    multi = MultinomialNB()
    multi.fit(train_mood_array, label)
    multi.predict(test_word_array)
    ada_real = AdaBoostClassifier(
        base_estimator=multi,
        learning_rate=learning_rate,
        n_estimators=n_estimators)
    ada_real.fit(train_mood_array, label)
    ada_real_err = np.zeros((n_estimators,))
    for i, y_pred in enumerate(ada_real.staged_predict(test_word_array)):
        ada_real_err[i] = zero_one_loss(y_pred, test_word_arrayLabel)
        print(ada_real_err[i])

    # ROC    start
    X_train = train_mood_array
    X_test = test_word_array
    y_train = label
    y_test = test_word_arrayLabel
    y = label
    y_score = ada_real.predict_proba(test_word_array)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in [0, 1]:
        fpr[i], tpr[i], _ = roc_curve(test_word_arrayLabel, y_score[:, 0], pos_label=1)
Example #16
def bdtModel(df_sig_train, df_bkg_train, df_sig_test, df_bkg_test):

    # '---------- Prepare Training ----------'

    X_sig = np.array(df_sig_train)
    y_sig = np.array(X_sig.shape[0] * [1])
    X_bkg = np.array(df_bkg_train)
    y_bkg = np.array(X_bkg.shape[0] * [0])

    X = np.concatenate((X_sig, X_bkg))
    y = np.concatenate((y_sig, y_bkg))

    print('X_sig.shape: ', X_sig.shape)
    print('y_sig.shape: ', y_sig.shape)
    print('X_bkg.shape: ', X_bkg.shape)
    print('y_bkg.shape: ', y_bkg.shape)
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)

    # '---------- Prepare Testing ----------'

    X_sig_test = np.array(df_sig_test)
    y_sig_test = np.array(X_sig_test.shape[0] * [1])
    X_bkg_test = np.array(df_bkg_test)
    y_bkg_test = np.array(X_bkg_test.shape[0] * [0])

    X_test = np.concatenate((X_sig_test, X_bkg_test))
    y_test = np.concatenate((y_sig_test, y_bkg_test))

    print('X_sig_test.shape: ', X_sig_test.shape)
    print('y_sig_test.shape: ', y_sig_test.shape)
    print('X_bkg_test.shape: ', X_bkg_test.shape)
    print('y_bkg_test.shape: ', y_bkg_test.shape)
    print('X_test.shape: ', X_test.shape)
    print('y_test.shape: ', y_test.shape)


    # '---------- Model ----------'

    #scaler = preprocessing.StandardScaler().fit(X)
    #X = scaler.transform(X)

    #model = svm.SVC(C = 50, kernel = 'rbf', tol=0.001, gamma=0.005, probability=True)
    #model.fit(X, y)

    dt = DecisionTreeClassifier(max_depth=3,
                                min_samples_leaf=0.05*len(X))
    model = AdaBoostClassifier(dt,
                             algorithm='SAMME',
                             n_estimators=400,
                             learning_rate=0.5)
    
    model.fit(X, y)


    print('---------- Training/Testing info ----------')

    print('Accuracy (training): ', model.score(X, y))
    print('Null Error Rate (training): ', y.mean())


    #X_test = scaler.transform(X_test)
    predicted_test = model.predict(X_test)

    predicted_test_clever = (predicted_test + y_test).tolist()
    error_test = float(predicted_test_clever.count(1)) / float(len(predicted_test_clever))
    print "Error: ", error_test

    print "Accuracy (testing): ", metrics.accuracy_score(y_test, predicted_test)
    print "Recall (testing): ",   metrics.recall_score(y_test, predicted_test)
    print "F1 score (testing): ", metrics.f1_score(y_test, predicted_test)
    print "ROC area under curve (testing): ", metrics.roc_auc_score(y_test, predicted_test)

    #'PTS','AST','REB','STL','BLK','FG_PCT','FG3_PCT','FT_PCT','MIN','EFF','WL']
    #user_input = scaler.transform(np.array([10, 1, 2, 0, 2, 0.3, 0.3, 0.3, 10, 5, 1], dtype=float))
    #user_input = scaler.transform(np.array([10,1,2,2,2,2,2,2,2,2,1], dtype=float))
    #user_input = scaler.transform(np.array([10,1,2], dtype=float))
    user_input = np.array([[10.15, 1.95, 6.77, 1.12, 0.28, 0.51, 0.37, 0.47, 32.5, 14.8, 0.53]], dtype=float)

    score = model.decision_function(user_input)
    print('Score (user input): ', score)
    result = model.predict_proba(user_input)
    print('Probability of 1 (user input): ', result)



    # '--------- Visualization -----------'

    Classifier_training_S = model.decision_function(X[y>0.5]).ravel()
    Classifier_training_B = model.decision_function(X[y<0.5]).ravel()
    Classifier_testing_S = model.decision_function(X_test[y_test>0.5]).ravel()
    Classifier_testing_B = model.decision_function(X_test[y_test<0.5]).ravel()

    (h_test_s, h_test_b) =  visualSigBkg("BDT", Classifier_training_S, Classifier_training_B, Classifier_testing_S, Classifier_testing_B)


    # '-------- Variable Importance ---------'
    feature_importance = model.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    mpl.style.use('ggplot')
    pl.subplot(1, 2, 2)
    pl.barh(pos, feature_importance[sorted_idx], align='center')
    pl.yticks(pos, df_sig_train.columns[sorted_idx])
    pl.xlabel('Relative Importance', fontsize=15)
    pl.title('Variable Importance', fontsize=15)
    #pl.show()
    plt.savefig("Var_importance.pdf")
    plt.close()


    fig = plt.figure()
    ax = fig.add_subplot(111)

    model_err = np.zeros((400,))
    for i, y_pred in enumerate(model.staged_predict(X_test)):
        model_err[i] = zero_one_loss(y_pred, y_test)
    
    model_err_train = np.zeros((400,))
    for i, y_pred in enumerate(model.staged_predict(X)):
        model_err_train[i] = zero_one_loss(y_pred, y)

    ax.plot(np.arange(400) + 1, model_err,
            label='AdaBoost Test Error',
            color='orange')
    ax.plot(np.arange(400) + 1, model_err_train,
            label='AdaBoost Train Error',
            color='green')
    
    ax.set_ylim((0.25, 0.35))
    ax.set_xlabel('Number of Trees')
    ax.set_ylabel('Error Rate')
    
    leg = ax.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.7)

    plt.savefig("ntrees.pdf")
    plt.close()    

    ########################################################### 

    return (model, X, y, result, model.score(X, y), error_test, score, h_test_s, h_test_b)
Example #17
    dtree = DecisionTreeClassifier(max_depth=1)
    """
                base_estimator=None,  子模型类型
                 n_estimators=50, 子模型个数
                 learning_rate=1., 学习步长,缩放因子
                 algorithm='SAMME.R',  
                 random_state=None):

    """
    algo = AdaBoostClassifier(base_estimator=dtree, n_estimators=10)
    # train the model
    algo.fit(X_train, y_train)
    # evaluate the model
    print('Training set accuracy: {}'.format(algo.score(X_train, y_train)))
    print('Test set accuracy: {}'.format(algo.score(X_test, y_test)))

    x_test = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3], [5.2, 3.4, 1.4, 0.2]]
    print('Predicted labels:')
    print(algo.predict(x_test))
    print("Predicted class probabilities:")
    print(algo.predict_proba(x_test))
    print("Log of the predicted class probabilities:")
    print(algo.predict_log_proba(x_test))

    print("All fitted base estimators:\n{}".format(algo.estimators_))
    x_test = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3], [5.2, 3.4, 2.9, 0.8]]
    generator = algo.staged_predict(x_test)
    print('Staged predictions:')
    for i in generator:
        print(i)
    print('Feature importance weights: {}'.format(algo.feature_importances_))
Example #18
label=label[index]
(X_train,X_test)=(data[0:30000],data[30000:])
(y_train,y_test)=(label[0:30000],label[30000:])
#X_train, X_test = getPics().trainData,getPics().testData
#y_train, y_test = getPics().trainLabel,getPics().testLabel
#print X_train.shape
#print y_train.shape
bdt_discrete = AdaBoostClassifier(
    CnnModel(),
    n_estimators=500,
    learning_rate=0.3,
    algorithm="SAMME")
bdt_discrete.fit(X_train, y_train)
discrete_test_errors = []

for  discrete_train_predict in bdt_discrete.staged_predict(X_test):
    discrete_test_errors.append(
        1. - accuracy_score(discrete_train_predict, y_test))

n_trees_discrete = len(bdt_discrete)
discrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete]
discrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete]

plt.figure(figsize=(15, 5))

plt.subplot(131)
plt.plot(range(1, n_trees_discrete + 1),
         discrete_test_errors, c='black')
plt.legend()
plt.ylim(0.18, 0.62)
plt.ylabel('Test Error')
Example #19
from sklearn.ensemble import AdaBoostClassifier
from read_data import read_data

def results_from_examples(ps,ls):
    return [1 if p == l else 0 for p,l in zip(ps,ls)]

def error_rate(rs):
    return 1.0-((1.0*sum(rs))/len(rs))

print "Sklearn"
examples,labels = read_data('Data/clean1_clean.data')
clf = AdaBoostClassifier(n_estimators=50)
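# With no base_estimator given, AdaBoostClassifier boosts depth-1 decision
# stumps by default.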
a = clf.fit(examples, labels)
score = a.score(examples, labels)
i = 0
print("Estimator, Ensemble error, Classifier error")
for value in clf.staged_predict(examples):
    rs = results_from_examples(value, labels)
    #print "Estimator: " + str(i) + " Ensemble error: " + str(error_rate(rs)) + " Classifier error: " + str(clf.estimator_errors_[i])
    print(str(i) + "," + str(error_rate(rs)) + "," + str(clf.estimator_errors_[i]))
    i = i + 1

print(score)
Example #20
targetValues = data["class"].values
attributes = data.drop(["class"], axis=1).values

attributesTrain, attributesTest, targetValuesTrain, targetValueTest = train_test_split(
    attributes, targetValues, test_size=0.1)

bestAccuracy, bestLearningRate, bestIteration = 0, 0, 0

for i in range(0, 100):
    learningRate = 1e-3 * 1.1**i
    classifier = AdaBoostClassifier(learning_rate=learningRate,
                                    n_estimators=100)
    classifier.fit(attributesTrain, targetValuesTrain)

    targetValuePredictedFunctions = classifier.staged_predict(attributes)
    scoreFunctions = classifier.staged_score(attributesTest, targetValueTest)
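    # staged_score yields the accuracy after each boosting stage, so the inner
    # loop below finds the stage count giving the best held-out score for the
    # current learning rate.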

    for j, item in enumerate(scoreFunctions, start=0):
        if (item > bestAccuracy):
            bestAccuracy = item
            bestLearningRate = learningRate
            bestIteration = j

print(bestLearningRate)
print(bestAccuracy)
print(bestIteration)

classifier = AdaBoostClassifier(learning_rate=bestLearningRate,
                                n_estimators=100)
classifier.fit(attributesTrain, targetValuesTrain)
Example #21
train_x, train_y = X[:2000], y[:2000]
# weak classifier
dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
dt_stump.fit(train_x, train_y)
dt_stump_err = 1.0 - dt_stump.score(test_x, test_y)
# decision tree classifier
dt = DecisionTreeClassifier()
dt.fit(train_x, train_y)
dt_err = 1.0 - dt.score(test_x, test_y)
# AdaBoost classifier
ada = AdaBoostClassifier(base_estimator=dt_stump, n_estimators=n_estimators)
ada.fit(train_x, train_y)
# visualize the error rates of the three classifiers
fig = plt.figure()
# configure the matplotlib font (SimHei)
plt.rcParams['font.sans-serif'] = ['SimHei']
ax = fig.add_subplot(111)
ax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-', label=u'Decision stump (weak classifier) error rate')
ax.plot([1, n_estimators], [dt_err] * 2, 'k--', label=u'Decision tree error rate')
ada_err = np.zeros((n_estimators,))
# iterate over the staged results: i is the iteration index, pred_y the staged prediction
for i, pred_y in enumerate(ada.staged_predict(test_x)):
    # record the error rate
    ada_err[i] = zero_one_loss(pred_y, test_y)
# plot the AdaBoost error rate at each iteration
ax.plot(np.arange(n_estimators) + 1, ada_err, label='AdaBoost test error rate', color='orange')
ax.set_xlabel('Number of iterations')
ax.set_ylabel('Error rate')
leg = ax.legend(loc='upper right', fancybox=True)
plt.show()
Example #22
#!/usr/bin/env python
if __name__ == '__main__':
    from sklearn.ensemble import AdaBoostClassifier as ABC
    from sklearn.tree import DecisionTreeClassifier as DTC
    import numpy as np
    from sklearn.metrics import accuracy_score
    from final_utils import read_hwfile

    # initialize data
    dat, lab, nDat = read_hwfile('ml14fall_train_align.dat.hog.dat', 169)
    nVal = nDat // 5
    nTrn = nDat - nVal
    datTrn = dat[:nTrn]
    labTrn = lab[:nTrn]
    datVal = dat[-nVal:]
    labVal = lab[-nVal:]
    print "#trn = {}, #val = {}".format(nTrn, nVal)


    classfier = ABC(DTC(max_depth=6, max_features=1), n_estimators=50000)
    classfier.fit(datTrn, labTrn)

    for i, labPre in enumerate(classfier.staged_predict(datVal)):
	if i % 10 == 9:
	    print accuracy_score(labPre, labVal)
Example #23
n_split = 3000

X_train, X_test = X[n_split:], X[:n_split]
Y_train, Y_test = y[n_split:], y[:n_split]

clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                         algorithm="SAMME.R",
                         n_estimators=600)

clf.fit(X_train, Y_train)

train_errors = []
test_errors = []
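# Each element yielded by staged_predict is the ensemble's prediction truncated
# after one more estimator, so zipping the train and test generators gives
# matched per-stage error curves.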

for train_predict, test_predict in zip(clf.staged_predict(X_train),
                                       clf.staged_predict(X_test)):
    train_errors.append(1 - accuracy_score(train_predict, Y_train))
    test_errors.append(1 - accuracy_score(test_predict, Y_test))

x = np.linspace(1, 600, 600)
plt.style.use('ggplot')
plt.plot(x, train_errors, 'r', label='SAMME.R Train Error')
plt.plot(x, test_errors, 'b', label='SAMME.R Test Error')
plt.legend(loc='upper right', fancybox=True)
plt.xlabel('Number of Estimators')
plt.ylabel('Error Rate')
# plt.show()
plt.savefig('error_rate.png', dpi=250)

# https://www.zybuluo.com/yxd/note/614495
Example #24
    def train_net(self,
                  model_s,
                  batch_size = 128,
                  epochs = 20
                  ):
        n_classes = 2
        
#        self.reduce_data(20)
#
#        # Generate new instances to fix any class imbalance(relevant for (16,) set)
#        sm = SMOTE()
#        self.X, self.labels = sm.fit_resample(self.X, self.labels)
        
        # Recalculate energy for SMOTEd instances
#        self.restore_energy_labels()
        
#        if self.verbose:
#            print('Done SMOTEing')
            
        # Test/train split
        x_train, x_test, y_train, y_test = train_test_split(self.X, self.labels, test_size = .2, shuffle = True)

        if self.verbose:
            print('Training balance: %.2f. Testing balance: %.2f' % (np.sum(y_train)/len(y_train), np.sum(y_test)/len(y_test)))
        
        input_shape = None
        if is_cnn(model_s):
            grey2rgb = requires_rgb(model_s)
            x_train, input_shape = self.prepare_X_for_cnn(x_train, grey2rgb)
            x_test, _ = self.prepare_X_for_cnn(x_test, grey2rgb)
        
        # convert class vectors to binary class matrices
        y_train = keras.utils.to_categorical(y_train, n_classes)
        y_test = keras.utils.to_categorical(y_test, n_classes)
        
        # Squawk if desired
        if self.verbose:
            print('x_train shape:', x_train.shape)
            print(x_train.shape[0], 'train samples')
            print(x_test.shape[0], 'test samples')
        
        # Get it
        model = get_model(model_s, input_shape)
        
        """
        CNN will likely overfit XY states, at least on L = 7 lattice. Hence we need early stopping.
        Patience is set to epochs such that we keep looking for the best model over all epochs.
        """
        es = EarlyStopping(monitor = 'val_loss', mode = 'min', patience = epochs, verbose = 1)
        
        # We also want to store our best model, as judged by accuracy
        mc = ModelCheckpoint('Models/Epoch{epoch:02d}_Acc{val_acc:.2f}_V%d_L%d_M%d_N%d_%s.h5' % (int(self.X_vortex), self.L, self.M, self.N, model_s) , monitor='val_acc', mode='max', verbose=1, save_best_only=True)
        
        # Check for boosting
        if self.boost_nn and is_nn(model_s):
            # Different convention for labels. AdaBoostClassifier expects Y to be of form (nsamples,)
            # This in turn means models in get_model must be modified _WHEN_ used in conjunction with AdaBoostClf
            y_test = y_test[:, 0] + y_test[:, 1]*-1
            y_train = y_train[:, 0] + y_train[:, 1]*-1
            
            y_test = (y_test+1)/2
            y_train = (y_train+1)/2
            
            build = lambda: get_model(model_s, input_shape)
            est = KerasClassifier(build_fn = build, epochs = epochs, batch_size = batch_size, verbose = 0)
            
            model = AdaBoostClassifier(base_estimator = est, n_estimators = 1)
            x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = .1)
            print(x_train.shape, y_train.shape)
            model.fit(x_train, y_train)
            self.MODEL = model
            self.XTE = x_test
            # Need to construct our own history manually
            pred_val = model.staged_predict(x_val)
            pred_tr = model.staged_predict(x_train)
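            # staged_predict returns lazy generators; they are consumed by the
            # zip below to build per-stage validation and training accuracies.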
            
            accs_val = []
            accs_train = []
            
            for predv, predr in zip(pred_val, pred_tr):
                accs_val.append(accuracy_score(predv, y_val))
                accs_train.append(accuracy_score(predr, y_train))
            
            # Bit lazy, but using accuracy is less hassle. But then we need to trick ourselves:
            history = Bunch()
            history.history = {'loss': accs_train,
                                'val_loss': accs_val
                                }
            score = (-1, accuracy_score(model.predict(x_test), y_test))
            
            # If it's an AdaBoosted neural net, we won't do early stopping or save/load. 
            # It's hackish, but we just store it in instance. Why? Because we already know
            # it'll perform worse than a CNN, so it's not worth the effort at the moment.
            self.model_adaboost = model
        else:
            # Fit and record history
            history = model.fit(x_train, y_train,
                      batch_size=batch_size,
                      epochs=epochs,
                      verbose=1,
                      callbacks = [es, mc],
                      validation_split = 0.1)
            
            # Get the score on the unseen test set
            score = model.evaluate(x_test, y_test, verbose=0)
        
        # Squawk if desired
        if self.verbose:
            print('Test loss:', score[0])
            print('Test accuracy:', score[1])
            

        y_true = y_test[:, 1].astype(int)
        y_pred = np.round(model.predict(x_test)[:, 1]).astype(int)
        
        self.AA = y_true
        self.BB = y_pred
        
        print(classification_report(y_true, y_pred))
        self.f1 = f1_score(y_true, y_pred)
        print('F1-score: %.3f' % self.f1)
        print(confusion_matrix(y_true, y_pred))
        self.rocauc = roc_auc_score(y_true, y_pred)
        self.accuracy = accuracy_score(y_true, y_pred)
        
        # Plot training history
        fig = plt.figure()
        ax = fig.add_subplot(111)
        
        ax.plot(history.history['loss'], label = 'train')
        ax.plot(history.history['val_loss'], label = 'val')
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Loss')
        if is_nn(model_s) and self.boost_nn:
            ax.set_ylabel('Accuracy')
        ax.set_title('Model: %s, Test score: %.3f' % (model_s, score[1]))
        ax.legend()
        
        # Save the plot to file
        plt.savefig('Plots/TrainTestScores/V%d_L%d_M%d_N%d_%s.png' % (int(self.X_vortex), self.L, self.M, self.N, model_s) )
        
        # Save a graph of the model
        plot_model(model, to_file = 'Plots/Model Graphs/%s.png' % (model_s)  )
        
        # And show plot if desired
        if self.plotty:
            plt.show()
Example #25
                 58,115,99,84,132,13,35,77,89,113,102,36,38,
                 131,39,94,5,66,2,134,51,96,24,114,121,120,46]
print(selectedBands)
selectedArray = data[:, selectedBands]

# Making AdaBoost
bdt_real = AdaBoostClassifier(
                                DecisionTreeClassifier(max_depth=3),
                                n_estimators=500,
                                learning_rate=1)

bdt_real.fit(selectedArray[np.where(train != 0)[0]], train[train != 0])
real_test_errors = []

for real_test_predict in \
        bdt_real.staged_predict(selectedArray[np.where(test != 0)[0]]):
    real_test_errors.append(1. - accuracy_score(real_test_predict, test[test != 0]))

n_trees_real = len(bdt_real)
real_estimator_errors = bdt_real.estimator_errors_[:n_trees_real]

classified = bdt_real.predict(selectedArray[np.where(test != 0)[0]])
score = accuracy_score(classified, test[test != 0])
print(score)

plt.figure(figsize=(15, 5))
plt.plot(range(1, n_trees_real + 1),
         real_test_errors, c='black',
         linestyle='dashed', label='SAMME.R')
plt.show()
Example #26
                #split into training and testing samples. test_size = proportion of data used for test
                x_train, x_test, y_train, y_test = train_test_split(value_to_classify, targs_to_classify, test_size = .4) 

                #########################
                #ADABoost Classifier
                #########################
                bdt_real = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=600,learning_rate=1)

                bdt_discrete = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=600,learning_rate=1.5,algorithm="SAMME")
                bdt_real.fit(x_train, y_train)
                bdt_discrete.fit(x_train, y_train)
            
                real_test_errors = []
                discrete_test_errors = []

                for real_test_predict, discrete_train_predict in zip(bdt_real.staged_predict(x_test), bdt_discrete.staged_predict(x_test)):
                    real_test_errors.append(1. - accuracy_score(real_test_predict, y_test))
                    discrete_test_errors.append(1. - accuracy_score(discrete_train_predict, y_test))

                n_trees_discrete = len(bdt_discrete)
                n_trees_real = len(bdt_real)

                # Boosting might terminate early, but the following arrays are always
                # n_estimators long. We crop them to the actual number of trees here:
                discrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete]
                real_estimator_errors = bdt_real.estimator_errors_[:n_trees_real]
                discrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete]
            
                # Test on the testing data set and display the accuracies
                ypred_r = bdt_real.predict(x_test)
                ypred_e = bdt_discrete.predict(x_test)
Example #27
    try:
        randomIndex = int(random.uniform(0, len(train_mood_array)))
        test_word_arrayLabel.append(label[randomIndex])
        test_word_array.append(train_mood_array[randomIndex])
        del (train_mood_array[randomIndex])
        del (label[randomIndex])
    except Exception as e:
        print(e)
multi = MultinomialNB()
ada_real = AdaBoostClassifier(base_estimator=multi,
                              learning_rate=learning_rate,
                              n_estimators=n_estimators,
                              algorithm="SAMME.R")
ada_real.fit(train_mood_array, label)
ada_real_err = np.zeros((n_estimators, ))  # 1-D array of length n_estimators
for i, y_pred in enumerate(ada_real.staged_predict(test_word_array)):  # evaluate on the test set
    ada_real_err[i] = zero_one_loss(y_pred,
                                    test_word_arrayLabel)  # fraction of mismatched predictions
ada_real_err_train = np.zeros((n_estimators, ))
for i, y_pred in enumerate(
        ada_real.staged_predict(train_mood_array)):  # staged predictions on the training samples themselves
    ada_real_err_train[i] = zero_one_loss(y_pred, label)


def test(word):
    word_array = bayes.build_word_array(word)
    asfaiajioaf = bayes.setOfWordsListToVecTor(vocabList, word_array)
    return ada_real.predict(asfaiajioaf)[0]


def testandscore(word):
Example #28
    learning_rate=1)

bdt_discrete = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=600,
    learning_rate=1.5,
    algorithm="SAMME")

bdt_real.fit(X_train, y_train)
bdt_discrete.fit(X_train, y_train)

real_test_errors = []
discrete_test_errors = []

for real_test_predict, discrete_train_predict in zip(
        bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)):
    real_test_errors.append(
        1. - accuracy_score(real_test_predict, y_test))
    discrete_test_errors.append(
        1. - accuracy_score(discrete_train_predict, y_test))

n_trees_discrete = len(bdt_discrete)
n_trees_real = len(bdt_real)

# Boosting might terminate early, but the following arrays are always
# n_estimators long. We crop them to the actual number of trees here:
discrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete]
real_estimator_errors = bdt_real.estimator_errors_[:n_trees_real]
discrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete]

plt.figure(figsize=(15, 5))
Example #29
    learning_rate=1)

bdt_discrete = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=600,
    learning_rate=1.5,
    algorithm="SAMME")

bdt_real.fit(X_train, y_train)
bdt_discrete.fit(X_train, y_train)

real_test_errors = []
discrete_test_errors = []

for real_test_predict, discrete_train_predict in zip(
        bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)):
    real_test_errors.append(
        1. - accuracy_score(real_test_predict, y_test))
    discrete_test_errors.append(
        1. - accuracy_score(discrete_train_predict, y_test))

n_trees_discrete = len(bdt_discrete)
n_trees_real = len(bdt_real)

# Boosting might terminate early, but the following arrays are always
# n_estimators long. We crop them to the actual number of trees here:
discrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete]
real_estimator_errors = bdt_real.estimator_errors_[:n_trees_real]
discrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete]

plt.figure(figsize=(15, 5))
Example #30
X, y = make_gaussian_quantiles(n_samples=13000, n_features=10,n_classes=3, random_state=1)
n_split = 3000

X_train, X_test = X[:n_split], X[n_split:]
y_train, y_test = y[:n_split], y[n_split:]
bdt_real = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=600,learning_rate=1)

bdt_discrete = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=600,learning_rate=1.5,algorithm="SAMME")

bdt_real.fit(X_train, y_train)
bdt_discrete.fit(X_train, y_train)

real_test_error = []
discrete_test_error = []

for rea_test_predict in bdt_real.staged_predict(X_test):
    real_test_error.append(1. - accuracy_score(rea_test_predict, y_test))
for discrete_test_predict in bdt_discrete.staged_predict(X_test):
    discrete_test_error.append(1. - accuracy_score(discrete_test_predict, y_test))
n_trees_discrete = len(bdt_discrete)
n_trees_real = len(bdt_real)

discrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete]
real_estimator_errors = bdt_real.estimator_errors_[:n_trees_real]
discrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete]

plt.figure(figsize=(15,5))
plt.subplot(131)
plt.plot(range(1, n_trees_discrete + 1),
         discrete_test_error, c='black', label='SAMME')
plt.plot(range(1, n_trees_real + 1),
Example #31
    base_estimator=dt_stump,
    learning_rate=learning_rate,
    n_estimators=n_estimators,
    algorithm="SAMME.R")
ada_real.fit(X_train, y_train)

fig = plt.figure()
ax = fig.add_subplot(111)

ax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-',
        label='Decision Stump Error')
ax.plot([1, n_estimators], [dt_err] * 2, 'k--',
        label='Decision Tree Error')

ada_discrete_err = np.zeros((n_estimators,))
for i, y_pred in enumerate(ada_discrete.staged_predict(X_test)):
    ada_discrete_err[i] = zero_one_loss(y_pred, y_test)

ada_discrete_err_train = np.zeros((n_estimators,))
for i, y_pred in enumerate(ada_discrete.staged_predict(X_train)):
    ada_discrete_err_train[i] = zero_one_loss(y_pred, y_train)

ada_real_err = np.zeros((n_estimators,))
for i, y_pred in enumerate(ada_real.staged_predict(X_test)):
    ada_real_err[i] = zero_one_loss(y_pred, y_test)

ada_real_err_train = np.zeros((n_estimators,))
for i, y_pred in enumerate(ada_real.staged_predict(X_train)):
    ada_real_err_train[i] = zero_one_loss(y_pred, y_train)

ax.plot(np.arange(n_estimators) + 1, ada_discrete_err,
Example #32
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################

### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

clf = AdaBoostClassifier(DecisionTreeClassifier())

clf.fit(features_train, labels_train)
predict = clf.staged_predict(features_test)
result = np.asarray(list(predict)) - np.asarray(labels_test)

target_hit = 0
for item in result:
    number_of_hits = 0
    for num in item:
        if num == 0:
            number_of_hits += 1
    if number_of_hits == len(item) - 1:
        print("Found.")
        target_hit += 1
accuracy = float(target_hit) / len(result)
print(str(accuracy))

try:
Example #33
def test_sparse_classification():
    # Check classification with sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""
        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super(CustomSVC, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(n_classes=1,
                                                   n_samples=15,
                                                   n_features=5,
                                                   random_state=42)
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [
            csc_matrix, csr_matrix, lil_matrix, coo_matrix, dok_matrix
    ]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME").fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME").fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # decision_function
        sparse_results = sparse_classifier.decision_function(X_test_sparse)
        dense_results = dense_classifier.decision_function(X_test)
        assert_array_equal(sparse_results, dense_results)

        # predict_log_proba
        sparse_results = sparse_classifier.predict_log_proba(X_test_sparse)
        dense_results = dense_classifier.predict_log_proba(X_test)
        assert_array_equal(sparse_results, dense_results)

        # predict_proba
        sparse_results = sparse_classifier.predict_proba(X_test_sparse)
        dense_results = dense_classifier.predict_proba(X_test)
        assert_array_equal(sparse_results, dense_results)

        # score
        sparse_results = sparse_classifier.score(X_test_sparse, y_test)
        dense_results = dense_classifier.score(X_test, y_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_decision_function
        sparse_results = sparse_classifier.staged_decision_function(
            X_test_sparse)
        dense_results = dense_classifier.staged_decision_function(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_predict_proba
        sparse_results = sparse_classifier.staged_predict_proba(X_test_sparse)
        dense_results = dense_classifier.staged_predict_proba(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_score
        sparse_results = sparse_classifier.staged_score(X_test_sparse, y_test)
        dense_results = dense_classifier.staged_score(X_test, y_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # Verify sparsity of data is maintained during training
        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix) for t in types])
Example #34
    ada_discrete = AdaBoostClassifier(
        base_estimator=dt_stump,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        algorithm="SAMME")
    ada_discrete.fit(X_train, y_train)

    ada_real = AdaBoostClassifier(
        base_estimator=dt_stump,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        algorithm="SAMME.R")
    ada_real.fit(X_train, y_train)

    ada_discrete_err = np.zeros((n_estimators,))
    for i, y_pred in enumerate(ada_discrete.staged_predict(X_test)):
        ada_discrete_err[i] = zero_one_loss(y_pred, y_test)/10.
    ada_discrete_err_ave += ada_discrete_err

    ada_discrete_err_train = np.zeros((n_estimators,))
    for i, y_pred in enumerate(ada_discrete.staged_predict(X_train)):
        ada_discrete_err_train[i] = zero_one_loss(y_pred, y_train)/10.
    ada_discrete_err_train_ave += ada_discrete_err_train

    ada_real_err = np.zeros((n_estimators,))
    for i, y_pred in enumerate(ada_real.staged_predict(X_test)):
        ada_real_err[i] = zero_one_loss(y_pred, y_test)/10.
    ada_real_err_ave += ada_real_err

    ada_real_err_train = np.zeros((n_estimators,))
    for i, y_pred in enumerate(ada_real.staged_predict(X_train)):
Example #35
def test_sparse_classification():
    # Check classification with sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super(CustomSVC, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(n_classes=1, n_samples=15,
                                                   n_features=5,
                                                   random_state=42)
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # decision_function
        sparse_results = sparse_classifier.decision_function(X_test_sparse)
        dense_results = dense_classifier.decision_function(X_test)
        assert_array_equal(sparse_results, dense_results)

        # predict_log_proba
        sparse_results = sparse_classifier.predict_log_proba(X_test_sparse)
        dense_results = dense_classifier.predict_log_proba(X_test)
        assert_array_equal(sparse_results, dense_results)

        # predict_proba
        sparse_results = sparse_classifier.predict_proba(X_test_sparse)
        dense_results = dense_classifier.predict_proba(X_test)
        assert_array_equal(sparse_results, dense_results)

        # score
        sparse_results = sparse_classifier.score(X_test_sparse, y_test)
        dense_results = dense_classifier.score(X_test, y_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_decision_function
        sparse_results = sparse_classifier.staged_decision_function(
            X_test_sparse)
        dense_results = dense_classifier.staged_decision_function(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_predict_proba
        sparse_results = sparse_classifier.staged_predict_proba(X_test_sparse)
        dense_results = dense_classifier.staged_predict_proba(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_score
        sparse_results = sparse_classifier.staged_score(X_test_sparse,
                                                        y_test)
        dense_results = dense_classifier.staged_score(X_test, y_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # Verify sparsity of data is maintained during training
        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix)
                   for t in types])
Example #36
        for j, w in enumerate([DT1, DT2, DT4]):
            # Weak classifier
            w.fit(X_train, y_train)
            err = 1.0 - w.score(X_train, y_train)
            ax.plot([1, n_estimators], [err] * 2,
                    styles[j],
                    label="Decision Tree, max depth %d (DT%d)" %
                    (depths[j], depths[j]))
            # AdaBoost classifier
            ada = AdaBoostClassifier(base_estimator=w,
                                     n_estimators=n_estimators,
                                     random_state=0)

            ada_train_err = np.zeros((n_estimators, ))
            ada.fit(X_train, y_train)
            for i, y_pred in enumerate(ada.staged_predict(X_train)):
                ada_train_err[i] = zero_one_loss(y_pred, y_train)

            smoothed = []
            # use a moving-average filter to smooth the plots -- done to make
            # it easier to see trends; you are encouraged to also plot
            # 'ada_train_err' to see the actual error curves!
            for i in range(len(ada_train_err)):
                temp = 0.
                counter = 0.
                for k in range(i - 5, i + 1):
                    if k >= 0:
                        temp += ada_train_err[k]
                        counter += 1.
                smoothed.append(temp / counter)
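
            # Equivalent vectorized smoothing (a sketch): a trailing window of
            # up to 6 points, matching the hand-written loop above; the names
            # window/sums/counts/smoothed_vectorized are introduced here.
            window = np.ones(6)
            sums = np.convolve(ada_train_err, window,
                               mode='full')[:len(ada_train_err)]
            counts = np.convolve(np.ones_like(ada_train_err), window,
                                 mode='full')[:len(ada_train_err)]
            smoothed_vectorized = sums / counts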
Example #37
             label='Class %s' % n,
             alpha=.5,
             edgecolor='k')
x1, x2, y1, y2 = plt.axis()
plt.axis((x1, x2, y1, y2 * 1.2))
plt.legend(loc='upper right')
plt.ylabel('Samples')
plt.xlabel('Score')
plt.title('Decision Scores')

plt.tight_layout()
plt.subplots_adjust(wspace=0.35)
plt.show()

ada_discrete_err_train = np.zeros((2000, ))
ada_discrete_err_train_mse = np.zeros((2000, ))
# one pass over the staged predictions fills both error curves
for i, y_pred in enumerate(bdt.staged_predict(X)):
    ada_discrete_err_train[i] = zero_one_loss(y, y_pred)
    ada_discrete_err_train_mse[i] = mean_squared_error(y, y_pred)

fig = plt.figure(2)
ax = fig.add_subplot(111)

ax.plot(np.arange(2000) + 1,
        ada_discrete_err_train_mse,
        label='AdaBoost Train Error MSE',
        color='red')

ax.plot(np.arange(2000) + 1,
        ada_discrete_err_train,
Example #38
X = np.genfromtxt("Data/train_mice2000PCA.csv", delimiter=",")
print(" read X ")
Y = np.genfromtxt("Data/train_Y.csv", delimiter=",", dtype='int32')
test = np.genfromtxt("Data/test_mice2000PCA.csv", delimiter=",")
test_Y = np.zeros((test.shape[0], 1))

bdt_discrete = AdaBoostClassifier(DecisionTreeClassifier(max_depth=7),
                                  n_estimators=20,
                                  learning_rate=1.5,
                                  algorithm="SAMME")
print("fitting ")
bdt_discrete.fit(X, Y)

print("score :", bdt_discrete.score(X, Y))

filename = 'adaboost_model.sav'
pickle.dump(bdt_discrete, open(filename, 'wb'))
# staged_predict yields one prediction array per boosting stage; keep the
# final stage as the ensemble prediction on the test set
pred = list(bdt_discrete.staged_predict(test))[-1]
pred = pred.reshape((-1, 1))
pred = pred.astype(np.int64)

idx = np.arange(test.shape[0]).reshape((-1, 1))
idx = idx.astype(np.int64)

output = np.concatenate((idx, pred), axis=1)
np.savetxt("results/adaboost_2000ftrs.csv",
           output.astype(int),
           fmt='%i',
           delimiter=",")
Example #39
n_split = 109296

X_train, X_test = X[:n_split], X[n_split:]
y_train, y_test = y[:n_split], y[n_split:]

bdt_discrete = AdaBoostClassifier(DecisionTreeClassifier(max_depth=None),
                                  n_estimators=50,
                                  learning_rate=1,
                                  algorithm="SAMME")

bdt_discrete.fit(X_train, y_train)

discrete_test_errors = []
discrete_test_accuracy = []

for discrete_test_predict in bdt_discrete.staged_predict(X_test):
    discrete_test_accuracy.append(
        accuracy_score(y_test, discrete_test_predict))
    discrete_test_errors.append(
        1. - accuracy_score(y_test, discrete_test_predict))

n_trees_discrete = len(bdt_discrete)

discrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete]
discrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete]

print("The estimator error of 50 stages are : ", discrete_estimator_errors)
print("The estimator weights of 50 stages are : ", discrete_estimator_weights)
print("The accuracy scores of 50 stages are : ", discrete_test_accuracy)

n_trees_discrete = len(bdt_discrete)
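
# A consistency sketch (assumes the SAMME update used above and ignores any
# degenerate stage where boosting stopped early): each estimator weight should
# equal learning_rate * (log((1 - err) / err) + log(n_classes - 1)).
import numpy as np
expected_weights = bdt_discrete.learning_rate * (
    np.log((1. - discrete_estimator_errors) / discrete_estimator_errors) +
    np.log(bdt_discrete.n_classes_ - 1.))
print("max |weight - expected| :",
      np.max(np.abs(discrete_estimator_weights - expected_weights)))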
Example #40
def run_main():
    tournament_season = 2003

    summary_data = read_summary_team_data()
    teams = read_team_meta_data()
    tourney_data = read_tournament_results(tournament_season)
    game_data = utils.compute_game_data(tourney_data, teams)

    computer_rankings = pd.read_csv(Path('../data/massey_seasons_with_id.csv'))
    computer_rankings = computer_rankings[
        computer_rankings['season'] >= tournament_season]

    tourney_data = recode_tourney_data(tourney_data)
    tourney_data = merge_tourney_summary_data(tourney_data, summary_data)
    tourney_data = join_tourney_team_data(tourney_data, teams)

    tourney_comp_ratings = merge_tourney_ranking_data(tourney_data,
                                                      computer_rankings)
    tourney_comp_ratings = utils.implement_top_conference_feature(
        tourney_data, teams, game_data, tourney_comp_ratings)
    tourney_comp_ratings = utils.implement_seed_threshold_feature(
        tourney_comp_ratings)
    tourney_comp_ratings = compute_delta_features(tourney_comp_ratings)

    feature_data = tourney_comp_ratings.drop(columns=[
        'round', 'game_date', 'seed_t', 'team_t', 'team_id_t', 'team_id_o',
        'team_o', 'seed_o', 'team_id_o', 'game_result', 'start_season',
        'game result', 'conf_name_t', 'conf_name_o'
    ]).copy()

    feature_data.drop(columns=[
        'pts_avg_t', 'pts_avg_o', 'opp_pts_avg_t', 'opp_pts_avg_o',
        'margin_victory_avg_t', 'margin_victory_avg_o', 'poss_avg_t',
        'poss_avg_o', 'fg_pct_t', 'fg_pct_o', 'off_rebs_avg_t',
        'off_rebs_avg_o', 'def_rebs_avg_t', 'def_rebs_avg_o', 'ft_pct_t',
        'ft_pct_o', 'to_avg_t', 'to_avg_o', 'steal_avg_t', 'steal_avg_o',
        'to_net_avg_t', 'to_net_avg_o', 'win_pct_t', 'win_pct_o',
        'off_rating_t', 'off_rating_o', 'ft_att_avg_t', 'ft_att_avg_o',
        'opp_pts_avg_t', 'opp_pts_avg_o', 'srs_t', 'srs_o', 'sos_t', 'sos_o',
        'sag_t', 'sag_o', 'wlk_t', 'wlk_o', 'wol_t', 'wol_o', 'rth_t', 'rth_o',
        'col_t', 'col_o', 'pom_t', 'pom_o', 'dol_t', 'dol_o', 'rpi_t', 'rpi_o',
        'mor_t', 'mor_o'
    ],
                      inplace=True)

    # for now drop the delta seed features
    feature_data.drop(columns=['upset_seed_threshold'], inplace=True)

    X = feature_data[feature_data['season_t'] >= tournament_season]
    y = tourney_comp_ratings[
        tourney_comp_ratings['season_t'] >= tournament_season]['game_result']
    X = X.drop(columns=['season_t'])

    feature_list = list(X)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=5)

    number_estimators = 501
    bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             algorithm="SAMME.R",
                             n_estimators=number_estimators,
                             learning_rate=1)

    bdt.fit(X_train, y_train)
    score = bdt.score(X_train, y_train)
    print("Training Model Score= ", score)

    y_pred = bdt.predict(X_test)
    print("AdaBoost model accuracy is %2.2f" %
          metrics.accuracy_score(y_test, y_pred))

    prediction_probabilities = bdt.predict_proba(X_test)
    win_probabilities = pd.Series(prediction_probabilities[:, 1],
                                  index=X_test.index)
    predictions = pd.Series(y_pred, index=y_test.index)
    test_games = tourney_comp_ratings[tourney_comp_ratings.index.isin(
        X_test.index)].copy()

    test_games['predicted_result'] = predictions
    test_games['pred_win_prob'] = win_probabilities

    missed_predictions = test_games[
        test_games['game_result'] !=
        test_games['predicted_result']].sort_values(by='pred_win_prob',
                                                    ascending=False)

    print("Missed predictions= ", missed_predictions.shape[0])

    feature_dictionary = utils.Feature_Dictionary()
    missed_predictions.apply(lambda x: feature_dictionary.print_game_info(
        test_games, x['season_t'], x['round'], x['team_t']),
                             axis=1)

    supporting_features = missed_predictions.apply(
        lambda row: utils.get_supporting_features(row, feature_dictionary,
                                                  feature_list),
        axis=1)

    missed_predictions = missed_predictions.merge(
        supporting_features.to_frame(name='supporting_features'),
        how='left',
        left_index=True,
        right_index=True)

    missed_predictions['features'] = 100 * missed_predictions[
        'supporting_features'].apply(lambda x: len(x)) / len(feature_list)

    missed_predictions['game_index'] = missed_predictions.index

    plot_missed_predictions_df = missed_predictions[['game_index', 'features']]
    plot_missed_predictions_df = pd.melt(
        plot_missed_predictions_df,
        id_vars='game_index',
        var_name='Features Supporting Outcome')
    # plot_missed_predictions_df.head()
    # m_plot = sns.barplot(x='game_index', y='value', hue='Features Supporting Outcome',
    #                      data=plot_missed_predictions_df)
    # plt.title("Percentage Of Features Consistent With Incorrectly Predicted Game Outcomes")
    # plt.ylabel('Percentage')
    # plt.xlabel('Game Index')
    # m_plot.figure.set_size_inches(20, 6)

    print("Missed Predictions with greater than 50% feature support")
    print(plot_missed_predictions_df[plot_missed_predictions_df['value'] > 50])

    # analyze game index 246
    game_index = 246
    print(missed_predictions.loc[game_index])
    missed_game = X.loc[game_index].to_frame().T

    staged_predictions = bdt.staged_predict(missed_game)

    class_team_votes = 0
    class_opp_votes = 0

    label_dict = {}

    estimator_stubs = []
    for stub_estimator in bdt.estimators_:
        stub_tree = stub_estimator.tree_
        stub_feature_index = stub_tree.feature[0]
        stub_feature = missed_game.columns[stub_feature_index]

        if stub_feature in label_dict:
            label_dict[stub_feature] += 1
        else:
            label_dict[stub_feature] = 1

        threshold_value = stub_tree.threshold[0]
        test_value = missed_game.iloc[0, stub_feature_index]
        left_child_node = stub_tree.children_left[0]
        left_values = stub_tree.value[left_child_node][0]
        right_child_node = stub_tree.children_right[0]
        right_values = stub_tree.value[right_child_node][0]
        node_samples = stub_tree.n_node_samples
        test_string = "Test: {0} ({1:6.3f}) <= {2:6.3f}".format(
            stub_feature, test_value, threshold_value)
        left_string = "Left: Samples= {0},  Values= {1:5.3f}, {2:5.3f}".format(
            node_samples[left_child_node], left_values[0], left_values[1])

        right_string = "Right: Samples= {0},  Values= {1:5.3f}, {2:5.3f}".format(
            node_samples[right_child_node], right_values[0], right_values[1])

        if test_value <= threshold_value:
            # choose left node
            if left_values[0] >= left_values[1]:
                result_string = "Result: Left --> Choose Class -1 --> Opp Team Wins"
                class_opp_votes += 1
            else:
                result_string = "Result: Left --> Choose Class 1  --> Team Wins"
                class_team_votes += 1
        else:
            # choose right node
            if right_values[0] >= right_values[1]:
                result_string = "Result: Right --> Choose Class -1 --> Opp Team Wins"
                class_opp_votes += 1
            else:
                result_string = "Result: Right --> Choose Class 1 --> Team Wins"
                class_team_votes += 1

        # staged_predict yields a length-1 array for the single game analyzed here
        if next(staged_predictions)[0] == 1:
            staged_prediction_string = 'Stage Prediction= Class 1'
        else:
            staged_prediction_string = 'Stage Prediction= Class -1'

        stub_dict = {
            'test_string': test_string,
            'left_string': left_string,
            'right_string': right_string,
            'result_string': result_string,
            'staged_prediction_string': staged_prediction_string
        }
        estimator_stubs.append(stub_dict)

    for item_count, item in enumerate(estimator_stubs):
        print("Estimator: ", item_count)
        print(item['test_string'])
        print(item['left_string'])
        print(item['right_string'])
        print(item['result_string'])
        print(item['staged_prediction_string'])
        print("-----------")

    print("Class Team Votes= ", class_team_votes)
    print("Class Opp Votes= ", class_opp_votes)
    print()
    print("Number of features in tree stumps= ", len(label_dict))
    for key, value in label_dict.items():
        print('Feature: ', key, ' Count= ', value)
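
    # Side check (a sketch, not part of the original analysis): with
    # algorithm="SAMME.R" the ensemble combines class probabilities rather than
    # counting discrete stump votes, so the raw tally above need not match the
    # final prediction.
    print("decision_function= ", bdt.decision_function(missed_game))
    print("final prediction= ", bdt.predict(missed_game))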

    return

# Adaboost with tree method, without PCA
#import warnings
#warnings.filterwarnings("ignore")
from imblearn.over_sampling import SMOTE
from collections import Counter

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
clf_no_pca = AdaBoostClassifier(n_estimators=1000, random_state=1)
clf_no_pca.fit(model_tr1, y_train)
real_val_macro1 = [0]
train_macro1 = []
for real_test_predict in clf_no_pca.staged_predict(model_val1):
    stage_macro_f1 = f1_score(y_val, real_test_predict, average='macro')
    if stage_macro_f1 > np.max(real_val_macro1):
        pred_opt1 = real_test_predict
    real_val_macro1.append(stage_macro_f1)
for real_train_predict in clf_no_pca.staged_predict(model_tr1):
    train_macro1.append(f1_score(y_train, real_train_predict, average='macro'))
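
# Follow-up sketch: recover the boosting stage with the best validation
# macro-F1 (real_val_macro1[0] is the 0 seed value, so index == stage number).
best_stage = int(np.argmax(real_val_macro1))
print("best validation macro-F1 %.4f at stage %d"
      % (np.max(real_val_macro1), best_stage))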


plt.figure()
plt.plot(range(1, n_trees_real + 1),
         real_val_macro1[1:], 
         label='val')
Example #42
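# The snippet below calls an error_rate helper that is not shown in this
# fragment; a minimal sketch consistent with how it is used:
from sklearn.metrics import accuracy_score

def error_rate(y_true, y_pred):
    # fraction of misclassified samples
    return 1.0 - accuracy_score(y_true, y_pred)
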
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.8,
                                                    random_state=SEED)

# clf = AdaBoostM1(n_estimators=100)
clf = AdaBoostClassifier(n_estimators=400, learning_rate=1)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)
print(f"Final training error rate:{ error_rate(y_train, y_pred)}")
y_pred = clf.predict(X_test)
print(f"Final test error rate:{ error_rate(y_test, y_pred)}")

training_errors = []
test_errors = []
for y_pred in clf.staged_predict(X_train):
    training_errors.append(error_rate(y_train, y_pred))

for y_pred in clf.staged_predict(X_test):
    test_errors.append(error_rate(y_test, y_pred))

fig1, ax = plt.subplots()
ax.plot(training_errors, c="tab:blue", label="Training error")
ax.plot(test_errors, c="tab:red", label="Test error")
plt.legend()
plt.ylabel("Misclassification error rate")
plt.xlabel("Boosting iteration")
plt.savefig("test_error.svg")
plt.close()
Example #43
from classifierWithFS import preLoadData, getBestMeanFeatures, selectFeatures

X_train, y_train_phq8, X_test, y_dev, y_train, y_test = preLoadData()
maxFeature, featureChoices = getBestMeanFeatures()

X_train, X_test, chosenFeatures, numOfFeatures = selectFeatures(
    maxFeature, featureChoices, X_train, X_test)

bdt_real = AdaBoostClassifier(random_state=13370)

bdt_real.fit(X_train, y_train)

real_test_errors = []

for real_test_predict in bdt_real.staged_predict(X_test):
    real_test_errors.append(1. - accuracy_score(real_test_predict, y_test))

n_trees_real = len(bdt_real)

# Boosting might terminate early, but the following arrays are always
# n_estimators long. We crop them to the actual number of trees here:
real_estimator_errors = bdt_real.estimator_errors_[:n_trees_real]
real_estimator_weights = bdt_real.estimator_weights_[:n_trees_real]
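
# Quick sanity check (a sketch): if boosting terminated early, fewer trees were
# fitted than the n_estimators requested in the constructor.
print("fitted %d of %d requested estimators"
      % (len(bdt_real.estimators_), bdt_real.n_estimators))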

#plt.figure(figsize=(15, 5))

#plt.subplot(131)
plt.plot(range(1, n_trees_real + 1),
         real_test_errors,
         c='black',
Example #44
File: read_tree.py  Project: alexshires/ml
    y_train = np.append(os, zs)
    print "training"
    base_ada.fit(X=X_train, y=y_train)

    os = np.ones(len(bkgtest))
    zs = np.zeros(len(sigtest))
    print "adding samples together"
    X_test = pandas.concat([sigtest, bkgtest])
    y_test = np.append(os, zs)


    sigoutput = base_ada.decision_function(X=sigtest)
    bkgoutput = base_ada.decision_function(X=bkgtest)
    from sklearn.metrics import accuracy_score
    test_errors = []
    for te in base_ada.staged_predict(X_test):
        test_errors.append(1. - accuracy_score(y_test, te))
    ntrees = len(test_errors)
    estimator_errors = base_ada.estimator_errors_[:ntrees]
    estimator_weights = base_ada.estimator_weights_[:ntrees]

    from matplotlib.ticker import LinearLocator

    with PdfPages("bdtplots.pdf") as pdf:
        xs, xe, ys, ye = get_hist(bkgoutput)
        plt.errorbar(xs, ys, xerr=xe, yerr=ye,
                     color='red', fmt='.',
                     label='bkg')
        xs, xe, ys, ye = get_hist(sigoutput)
        plt.errorbar(xs, ys, xerr=xe, yerr=ye,
                     color='blue', fmt='.',