예제 #1
0
def solution(filename):
    # Считывание данных
    data = pd.read_csv(filename + '.csv')
    # data['class'] = pd.factorize(data['class'])[0]
    y = data['class'].values
    x = data.drop(['class'], axis=1).values

    # Разбиение данных на тестовую и тренировочную выборки
    # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15)

    # Алгоритм бустинга
    if filename == 'chips':
        learning_rate = 0.15
    else:
        learning_rate = 1.15
    model = AdaBoostClassifier(learning_rate=learning_rate, n_estimators=150)
    model.fit(x, y)

    # Построение графика для каждого шага алгоритма
    plt.figure()
    plt.scatter(x[:, 0], x[:, 1], c=colors(y), cmap=plt.cm.Paired)
    ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    xx = np.linspace(xlim[0], xlim[1], 30)
    yy = np.linspace(ylim[0], ylim[1], 30)
    YY, XX = np.meshgrid(yy, xx)
    xy = np.vstack([XX.ravel(), YY.ravel()]).T

    y_pred = model.staged_predict(x)
    decision_funcs = model.staged_decision_function(xy)

    i = 1
    for _, func in zip(y_pred, decision_funcs):
        plt.clf()
        plt.scatter(x[:, 0], x[:, 1], c=colors(y), cmap=plt.cm.Paired)
        f = func.reshape(XX.shape)
        ax = plt.gca()
        ax.contour(XX,
                   YY,
                   f,
                   colors='k',
                   levels=[-1, 0, 1],
                   alpha=0.5,
                   linestyles=['--', '-', '--'])
        plt.legend([f'Step number {i}'])
        plt.savefig(filename + '/' + str(i) + '.png')

        i += 1
def test_sparse_classification():
    # Check classification with sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super(CustomSVC, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(n_classes=1, n_samples=15,
                                                   n_features=5,
                                                   random_state=42)
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME"
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # decision_function
        sparse_results = sparse_classifier.decision_function(X_test_sparse)
        dense_results = dense_classifier.decision_function(X_test)
        assert_array_equal(sparse_results, dense_results)

        # predict_log_proba
        sparse_results = sparse_classifier.predict_log_proba(X_test_sparse)
        dense_results = dense_classifier.predict_log_proba(X_test)
        assert_array_equal(sparse_results, dense_results)

        # predict_proba
        sparse_results = sparse_classifier.predict_proba(X_test_sparse)
        dense_results = dense_classifier.predict_proba(X_test)
        assert_array_equal(sparse_results, dense_results)

        # score
        sparse_results = sparse_classifier.score(X_test_sparse, y_test)
        dense_results = dense_classifier.score(X_test, y_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_decision_function
        sparse_results = sparse_classifier.staged_decision_function(
            X_test_sparse)
        dense_results = dense_classifier.staged_decision_function(X_test)
        for sprase_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sprase_res, dense_res)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sprase_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sprase_res, dense_res)

        # staged_predict_proba
        sparse_results = sparse_classifier.staged_predict_proba(X_test_sparse)
        dense_results = dense_classifier.staged_predict_proba(X_test)
        for sprase_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sprase_res, dense_res)

        # staged_score
        sparse_results = sparse_classifier.staged_score(X_test_sparse,
                                                        y_test)
        dense_results = dense_classifier.staged_score(X_test, y_test)
        for sprase_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sprase_res, dense_res)

        # Verify sparsity of data is maintained during training
        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix)
                   for t in types])
예제 #3
0
def test_sparse_classification():
    # Check classification with sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""
        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super(CustomSVC, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(n_classes=1,
                                                   n_samples=15,
                                                   n_features=5,
                                                   random_state=42)
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [
            csc_matrix, csr_matrix, lil_matrix, coo_matrix, dok_matrix
    ]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME").fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME").fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # decision_function
        sparse_results = sparse_classifier.decision_function(X_test_sparse)
        dense_results = dense_classifier.decision_function(X_test)
        assert_array_equal(sparse_results, dense_results)

        # predict_log_proba
        sparse_results = sparse_classifier.predict_log_proba(X_test_sparse)
        dense_results = dense_classifier.predict_log_proba(X_test)
        assert_array_equal(sparse_results, dense_results)

        # predict_proba
        sparse_results = sparse_classifier.predict_proba(X_test_sparse)
        dense_results = dense_classifier.predict_proba(X_test)
        assert_array_equal(sparse_results, dense_results)

        # score
        sparse_results = sparse_classifier.score(X_test_sparse, y_test)
        dense_results = dense_classifier.score(X_test, y_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_decision_function
        sparse_results = sparse_classifier.staged_decision_function(
            X_test_sparse)
        dense_results = dense_classifier.staged_decision_function(X_test)
        for sprase_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sprase_res, dense_res)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sprase_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sprase_res, dense_res)

        # staged_predict_proba
        sparse_results = sparse_classifier.staged_predict_proba(X_test_sparse)
        dense_results = dense_classifier.staged_predict_proba(X_test)
        for sprase_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sprase_res, dense_res)

        # staged_score
        sparse_results = sparse_classifier.staged_score(X_test_sparse, y_test)
        dense_results = dense_classifier.staged_score(X_test, y_test)
        for sprase_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sprase_res, dense_res)

        # Verify sparsity of data is maintained during training
        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix) for t in types])
예제 #4
0
               attributes[:, 1],
               c=colors(targetValues),
               cmap=pyplot.cm.Paired)

axes = pyplot.gca()
xLim = axes.get_xlim()
yLim = axes.get_ylim()

xs = numpy.linspace(xLim[0], xLim[1], 30)
ys = numpy.linspace(yLim[0], yLim[1], 30)

xCoords, yCoords = numpy.meshgrid(xs, ys)

xyCoords = numpy.vstack([xCoords.ravel(), yCoords.ravel()]).T

Z = classifier.staged_decision_function(xyCoords)

iterationsCount = []
accuracies = []

os.mkdir(fileName)

i = 1
for item, z in zip(targetValuePredictedFunctions, Z):
    pyplot.clf()

    pyplot.xlabel("Step number " + str(i))
    pyplot.scatter(attributes[:, 0],
                   attributes[:, 1],
                   c=colors(targetValues),
                   cmap=pyplot.cm.Paired)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i = 1
    plt.show()

    clf.fit(X_train, y_train)
    score = clf.staged_score(X_test, y_test)
    step = 1
    for S, Z, est in zip(
            score, clf.staged_decision_function(np.c_[xx.ravel(),
                                                      yy.ravel()]),
            clf.estimators_):
        # iterate over classifiers
        ax = plt.subplot(1, 1, i)

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        #size = np.repeat(2.0, X_train.shape[0]) + (6-2.0)*(est.sample_weight - est.sample_weight.min())/(est.sample_weight.max() - est.sample_weight.min() + 0.0001)
        size = np.repeat(2.0, X_train.shape[0])
        size = 2 * np.pi * size**2
        # Plot also the training points
        ax.scatter(X_train[:, 0],
                   X_train[:, 1],
                   c=y_train,