Example #1
def main():
    # prepare sample data and target variable
    labels = ['setosa', 'versicolor']
    features = ['petal length (cm)', 'petal width (cm)']
    D = IrisData(features, labels)
    X = D.X
    y = D.y

    # split sample data into training data and test data and standardize them
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
    sc = StandardScaler().fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    # fit classifiers
    classifier = LogisticRegressionGD(eta=0.05, n_iter=1000, random_state=1).fit(X_train_std, y_train)

    # show accuracy
    y_pred = classifier.predict(X_test_std)
    print('misclassified samples: {}'.format(np.sum(y_test != y_pred)))

    # show history of costs
    plot_update_history(classifier)

    # show decision regions
    plot_decision_regions(X_train_std, y_train, classifier=classifier)
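
All of these examples build their samples through an IrisData helper that is not shown in this listing. A minimal sketch consistent with how it is called, assuming scikit-learn's load_iris and selection of features and classes by name (the class itself is hypothetical, not part of scikit-learn):

import numpy as np
from sklearn.datasets import load_iris

class IrisData:
    """Hypothetical sketch: subset the iris dataset by feature and class names."""
    def __init__(self, features=None, labels=None):
        iris = load_iris()
        f_idx = [iris.feature_names.index(f)
                 for f in (features or iris.feature_names)]
        l_idx = [list(iris.target_names).index(l)
                 for l in (labels or list(iris.target_names))]
        mask = np.isin(iris.target, l_idx)
        self.X = iris.data[mask][:, f_idx]   # selected feature columns
        self.y = iris.target[mask]           # original integer class labels
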
Example #2
def main():
    # prepare sample data and target variable
    labels = ['setosa', 'versicolor', 'virginica']
    features = ['petal length (cm)', 'petal width (cm)']
    D = IrisData(features, labels)
    X = D.X
    y = D.y

    # split sample data into training data and test data and standardize them
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=1,
                                                        stratify=y)
    sc = StandardScaler().fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    # combine training data and test data
    X_combined_std = np.vstack((X_train_std, X_test_std))
    y_combined = np.hstack((y_train, y_test))

    # fit classifiers
    classifier = LogisticRegression(C=100.0,
                                    random_state=1,
                                    solver='liblinear',
                                    multi_class='ovr').fit(
                                        X_train_std, y_train)

    # show accuracy
    y_pred = classifier.predict(X_test_std)
    print('misclassified samples: {}'.format(np.sum(y_test != y_pred)))

    # show decision regions
    plot_decision_regions(X_combined_std,
                          y_combined,
                          classifier=classifier,
                          test_idx=list(range(len(y_train), len(y))))

    # show effect of regularization parameter
    weights = []
    params = []
    for i in np.arange(-5, 5):
        C = 10.0**i
        classifier = LogisticRegression(C=C,
                                        random_state=1,
                                        solver='liblinear',
                                        multi_class='ovr').fit(
                                            X_train_std, y_train)
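        # under one-vs-rest, coef_ holds one weight vector per class;
        # index 1 tracks the weights learned for the second class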
        weights.append(classifier.coef_[1])
        params.append(C)
    weights = np.array(weights)
    plt.axhline(y=0, linewidth=1, linestyle='--', color='k')
    plt.plot(params, weights[:, 0], label='petal length')
    plt.plot(params, weights[:, 1], linestyle='--', label='petal width')
    plt.xscale('log')
    plt.xlabel('C')
    plt.ylabel('weight coefficient')
    plt.show()
Example #3
def main():
    # prepare sample data and target variable
    labels = ['setosa', 'versicolor', 'virginica']
    features = ['petal length (cm)', 'petal width (cm)']
    D = IrisData(features, labels)
    X = D.X
    y = D.y

    # split sample data into training data and test data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=1,
                                                        stratify=y)

    # combine training data and test data
    X_combined = np.vstack((X_train, X_test))
    y_combined = np.hstack((y_train, y_test))

    # fit classifiers
    classifiers = [
        RandomForestClassifier(criterion='gini',
                               n_estimators=10,
                               random_state=1,
                               n_jobs=2).fit(X_train, y_train),
        RandomForestClassifier(criterion='gini',
                               n_estimators=25,
                               random_state=1,
                               n_jobs=2).fit(X_train, y_train),
        RandomForestClassifier(criterion='gini',
                               n_estimators=50,
                               random_state=1,
                               n_jobs=2).fit(X_train, y_train),
        RandomForestClassifier(criterion='gini',
                               n_estimators=100,
                               random_state=1,
                               n_jobs=2).fit(X_train, y_train),
        RandomForestClassifier(criterion='entropy',
                               n_estimators=100,
                               random_state=1,
                               n_jobs=2).fit(X_train, y_train)
    ]

    for classifier in classifiers:
        # show accuracy
        y_pred = classifier.predict(X_test)
        print('misclassified samples: {}'.format(np.sum(y_test != y_pred)))

        # show decision regions
        plot_decision_regions(X_combined,
                              y_combined,
                              classifier=classifier,
                              test_idx=list(range(len(y_train), len(y))))
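
A fitted forest also exposes per-feature importance scores; appending these lines to the loop body above would print them (feature_importances_ is standard scikit-learn API):

        # show per-feature importances of the current forest
        for feature, importance in zip(features, classifier.feature_importances_):
            print('{}: {:.3f}'.format(feature, importance))
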
Example #4
def main():
    # prepare sample data and target variable
    features = ['alcohol', 'od280/od315_of_diluted_wines']
    labels = ['class_1', 'class_2']
    wine_data = WineData(features, labels)
    X = wine_data.X
    y = wine_data.y

    # split sample data into training data and test data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1,
                                                        stratify=y)
    X_combined = np.vstack((X_train, X_test))
    y_combined = np.hstack((y_train, y_test))

    # fit classifiers
    decision_tree = DecisionTreeClassifier(criterion='entropy',
                                           max_depth=None,
                                           random_state=1)
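    # note: scikit-learn >= 1.2 renames base_estimator to estimator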
    bagging = BaggingClassifier(base_estimator=decision_tree,
                                n_estimators=500,
                                max_samples=1.0,
                                max_features=1.0,
                                bootstrap=True,
                                bootstrap_features=False,
                                n_jobs=1,
                                random_state=1)
    classifiers = [decision_tree, bagging]
    for classifier in classifiers:
        classifier.fit(X_train, y_train)

    names = ['decision tree', 'bagging']
    for classifier, name in zip(classifiers, names):
        # show score
        print('[{name}]'.format(name=name))
        print('training score:', classifier.score(X_train, y_train))
        print('test score:', classifier.score(X_test, y_test))

        # show decision regions
        plot_decision_regions(X_combined,
                              y_combined,
                              classifier=classifier,
                              test_idx=list(range(len(y_train), len(y))),
                              title=name)
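
WineData is likewise the author's own wrapper, used with explicit features/labels here and with no arguments in the later examples. A minimal sketch consistent with both call styles, assuming scikit-learn's load_wine:

import numpy as np
from sklearn.datasets import load_wine

class WineData:
    """Hypothetical sketch: subset the wine dataset by feature and class names."""
    def __init__(self, features=None, labels=None):
        wine = load_wine()
        f_idx = [wine.feature_names.index(f)
                 for f in (features or wine.feature_names)]
        l_idx = [list(wine.target_names).index(l)
                 for l in (labels or list(wine.target_names))]
        mask = np.isin(wine.target, l_idx)
        self.X = wine.data[mask][:, f_idx]
        self.y = wine.target[mask]
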
Example #5
def main():
    # prepare sample data and target variable
    labels = ['setosa', 'versicolor', 'virginica']
    features = ['petal length (cm)', 'petal width (cm)']
    D = IrisData(features, labels)
    X = D.X
    y = D.y

    # split sample data into training data and test data and standardize them
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=1,
                                                        stratify=y)
    sc = StandardScaler().fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    # combine training data and test data
    X_combined_std = np.vstack((X_train_std, X_test_std))
    y_combined = np.hstack((y_train, y_test))

    # fit classifiers
    classifiers = [
        KNeighborsClassifier(n_neighbors=4, p=2,
                             metric='minkowski').fit(X_train_std, y_train),
        KNeighborsClassifier(n_neighbors=5, p=2,
                             metric='minkowski').fit(X_train_std, y_train),
        KNeighborsClassifier(n_neighbors=6, p=2,
                             metric='minkowski').fit(X_train_std, y_train),
        KNeighborsClassifier(n_neighbors=5, p=1,
                             metric='minkowski').fit(X_train_std, y_train)
    ]

    for classifier in classifiers:
        # show accuracy
        y_pred = classifier.predict(X_test_std)
        print('misclassified samples: {}'.format(np.sum(y_test != y_pred)))

        # show decision regions
        plot_decision_regions(X_combined_std,
                              y_combined,
                              classifier=classifier,
                              test_idx=list(range(len(y_train), len(y))))
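
plot_decision_regions is called by every example but never defined here; it matches the well-known helper from Raschka's book. A minimal two-feature sketch with the extra keywords these examples pass (the body is an assumption reconstructed from the call sites):

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

def plot_decision_regions(X, y, classifier, test_idx=None,
                          xlabel='', ylabel='', title='', resolution=0.02):
    # shade the classifier's prediction over a 2-D grid of the feature plane
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    plt.contourf(xx1, xx2, Z.reshape(xx1.shape), alpha=0.3, cmap=cmap)
    # overlay the actual samples, one marker per class
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(X[y == cl, 0], X[y == cl, 1], alpha=0.8,
                    c=colors[idx], marker=markers[idx], label=str(cl))
    # circle the test samples if their indices are given
    if test_idx:
        plt.scatter(X[test_idx, 0], X[test_idx, 1], facecolors='none',
                    edgecolors='black', s=100, label='test set')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend(loc='upper left')
    plt.show()
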
Example #6
def main():
    # prepare sample data and target variable
    wine_data = WineData()
    X = wine_data.X
    y = wine_data.y

    # split sample data into training data and test data and standardize them
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0,
                                                        stratify=y)
    sc = StandardScaler().fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    lda_transformers = [
        lda.LDA(n_components=2),
        LinearDiscriminantAnalysis(n_components=2)
    ]
    for lda_transformer in lda_transformers:
        # execute LDA
        X_train_lda = lda_transformer.fit_transform(X_train_std, y_train)

        # show coefficients and explained variance ratio
        print('coefficients:\n', lda_transformer.coef_)
        print('explained variance ratio:',
              lda_transformer.explained_variance_ratio_)
        plot_features(X_train_lda, y_train, xlabel='LD1', ylabel='LD2')

        # fit classifier and plot decision regions
        classifier = LogisticRegression(C=100.0,
                                        random_state=1,
                                        solver='liblinear',
                                        multi_class='ovr').fit(
                                            X_train_lda, y_train)
        X_test_lda = lda_transformer.transform(X_test_std)
        print('score: ', classifier.score(X_test_lda, y_test))
        plot_decision_regions(X_test_lda,
                              y_test,
                              classifier=classifier,
                              xlabel='LD1',
                              ylabel='LD2')
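
plot_features is another small undefined helper; a minimal sketch that scatters the two projected components per class (hypothetical, reconstructed from the call sites):

import numpy as np
import matplotlib.pyplot as plt

def plot_features(X, y, xlabel='', ylabel=''):
    # scatter the first two columns of X, one color per class
    for cl in np.unique(y):
        plt.scatter(X[y == cl, 0], X[y == cl, 1], label=str(cl))
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(loc='best')
    plt.show()
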
Example #7
def main():
    # prepare training data and target variable
    features = ['sepal length (cm)', 'petal length (cm)']
    labels = ['setosa', 'versicolor']
    D = IrisData(features, labels)
    X = D.X
    y = np.where(D.y == 0, -1, 1)

    # fit perceptron
    classifier = Perceptron(eta=0.1, n_iter=10)
    classifier.fit(X, y)

    # show history of errors
    plot_update_history(classifier)

    # show decision regions
    plot_decision_regions(X,
                          y,
                          classifier=classifier,
                          xlabel='sepal length [cm]',
                          ylabel='petal length [cm]')
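
plot_update_history is also undefined in this listing; a minimal sketch, assuming the fitted classifier records a per-epoch cost list cost_ or an update-count list errors_ (the convention of Raschka's implementations):

import matplotlib.pyplot as plt

def plot_update_history(classifier):
    # plot the per-epoch cost (or error-count) history of a fitted classifier
    history = getattr(classifier, 'cost_', None)
    if history is None:
        history = classifier.errors_
    plt.plot(range(1, len(history) + 1), history, marker='o')
    plt.xlabel('epoch')
    plt.ylabel('cost / number of updates')
    plt.show()
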
Example #8
def main():
    # prepare sample data and target variable
    wine_data = WineData()
    X = wine_data.X
    y = wine_data.y

    # split sample data into training data and test data and standardize them
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0,
                                                        stratify=y)
    sc = StandardScaler().fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    pca_transformers = [pca.PCA(n_components=2), PCA(n_components=2)]
    for pca_transformer in pca_transformers:
        # execute PCA
        X_train_pca = pca_transformer.fit_transform(X_train_std)

        # show principal components and explained variance
        print('principal components:\n', pca_transformer.components_)
        print('explained variance:', pca_transformer.explained_variance_)
        plot_features(X_train_pca, y_train, xlabel='PC1', ylabel='PC2')

        # fit classifier and plot decision regions
        classifier = LogisticRegression(C=100.0,
                                        random_state=1,
                                        solver='liblinear',
                                        multi_class='ovr').fit(
                                            X_train_pca, y_train)
        X_test_pca = pca_transformer.transform(X_test_std)
        print('score: ', classifier.score(X_test_pca, y_test))
        plot_decision_regions(X_test_pca,
                              y_test,
                              classifier=classifier,
                              xlabel='PC1',
                              ylabel='PC2')
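
pca.PCA (like lda.LDA in the previous example) is presumably the author's own implementation, compared here against scikit-learn's. A minimal eigendecomposition-based stand-in, consistent with the attributes this example reads (components_, explained_variance_), purely illustrative:

import numpy as np

class PCA:
    """Hypothetical stand-in for the author's pca.PCA."""
    def __init__(self, n_components):
        self.n_components = n_components

    def fit_transform(self, X):
        # eigendecompose the covariance matrix of the (standardized) data
        eigvals, eigvecs = np.linalg.eigh(np.cov(X.T))
        order = np.argsort(eigvals)[::-1][:self.n_components]
        self.components_ = eigvecs[:, order].T        # (n_components, n_features)
        self.explained_variance_ = eigvals[order]
        return self.transform(X)

    def transform(self, X):
        # project onto the principal axes
        return X @ self.components_.T
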
Example #9
def main():
    # prepare training data and target variable
    features = ['sepal length (cm)', 'petal length (cm)']
    labels = ['setosa', 'versicolor']
    D = IrisData(features, labels)
    X = D.X
    y = np.where(D.y == 0, -1, 1)

    # standardize training data column-wise (iterate over feature columns,
    # not labels; the two counts only coincide here)
    X_std = np.copy(X)
    for i in range(X.shape[1]):
        X_std[:, i] = (X[:, i] - X[:, i].mean()) / X[:, i].std()

    # fit classifiers
    classifiers = [
        AdalineGD(eta=0.01, n_iter=10).fit(X, y),
        AdalineGD(eta=0.0001, n_iter=10).fit(X, y),
        AdalineGD(eta=0.01, n_iter=15).fit(X_std, y),
        AdalineSGD(eta=0.01, n_iter=15).fit(X_std, y)
    ]

    # show history of costs
    for classifier in classifiers:
        plot_update_history(classifier)

    # show decision regions
    plot_decision_regions(X_std,
                          y,
                          classifier=classifiers[2],
                          xlabel='sepal length [standardized]',
                          ylabel='petal length [standardized]')
    plot_decision_regions(X_std,
                          y,
                          classifier=classifiers[3],
                          xlabel='sepal length [standardized]',
                          ylabel='petal length [standardized]')
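
AdalineGD, AdalineSGD, Perceptron, and LogisticRegressionGD in these examples are custom gradient-descent classifiers in the style of Raschka's book, not scikit-learn estimators. For reference, a minimal AdalineGD sketch: batch gradient descent on the sum-of-squared-errors cost, storing the history in cost_ (the other classes follow the same pattern):

import numpy as np

class AdalineGD:
    """Minimal ADALINE with batch gradient descent (illustrative sketch)."""
    def __init__(self, eta=0.01, n_iter=50, random_state=1):
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state

    def fit(self, X, y):
        rgen = np.random.RandomState(self.random_state)
        self.w_ = rgen.normal(loc=0.0, scale=0.01, size=1 + X.shape[1])
        self.cost_ = []
        for _ in range(self.n_iter):
            output = X @ self.w_[1:] + self.w_[0]    # linear activation
            errors = y - output
            self.w_[1:] += self.eta * X.T @ errors   # batch gradient step
            self.w_[0] += self.eta * errors.sum()
            self.cost_.append((errors ** 2).sum() / 2.0)
        return self

    def predict(self, X):
        return np.where(X @ self.w_[1:] + self.w_[0] >= 0.0, 1, -1)
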
Example #10
def main():
    # prepare sample data and target variable
    labels = ['versicolor', 'virginica']
    features = ['sepal width (cm)', 'petal length (cm)']
    D = IrisData(features, labels)
    X = D.X
    y = D.y

    # split sample data into training data and test data and standardize them
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=1,
                                                        stratify=y)
    sc = StandardScaler().fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    # combine training data and test data
    X_combined_std = np.vstack((X_train_std, X_test_std))
    y_combined = np.hstack((y_train, y_test))

    # prepare classifiers
    logistic_regression = LogisticRegression(penalty='l2',
                                             solver='liblinear',
                                             C=0.001,
                                             random_state=1)
    decision_tree = DecisionTreeClassifier(criterion='entropy',
                                           max_depth=1,
                                           random_state=0)
    knn = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')
    majority_vote = MajorityVoteClassifier(
        classifiers=[logistic_regression, decision_tree, knn])
    classifiers = [logistic_regression, decision_tree, knn, majority_vote]
    classifier_names = [
        'logistic regression', 'decision tree', 'KNN', 'majority vote'
    ]

    # compute cross validation score of classifiers
    for classifier, name in zip(classifiers, classifier_names):
        scores = cross_val_score(estimator=classifier,
                                 X=X_train_std,
                                 y=y_train,
                                 cv=10,
                                 scoring='accuracy')
        print('accuracy : {mean:f} +/- {std:f} ({name})'.format(
            mean=np.mean(scores), std=np.std(scores), name=name))

    # execute grid search
    param_grid = {
        'decisiontreeclassifier__max_depth': [1, 2],
        'logisticregression__C': [0.001, 0.1, 100.0]
    }
    grid = GridSearchCV(estimator=majority_vote,
                        param_grid=param_grid,
                        cv=10,
                        scoring='accuracy').fit(X_train_std, y_train)
    for mean, std, params in zip(grid.cv_results_['mean_test_score'],
                                 grid.cv_results_['std_test_score'],
                                 grid.cv_results_['params']):
        print('accuracy: {mean:f} +/- {std:f} with {params}'.format(
            mean=mean, std=std, params=params))

    # plot ROC curves
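    # predict_proba's second column scores the second class; assuming IrisData
    # keeps the original iris encoding (versicolor=1, virginica=2), the matching
    # roc_curve pos_label is pos_label_index + 1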
    pos_label_index = 1
    for classifier, name in zip(classifiers, classifier_names):
        y_pred = classifier.fit(
            X_train_std, y_train).predict_proba(X_test_std)[:, pos_label_index]
        fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=pos_label_index + 1)
        plt.plot(fpr,
                 tpr,
                 label='{name} (AUC = {auc:f})'.format(name=name,
                                                       auc=auc(fpr, tpr)))
    plt.grid(alpha=0.5)
    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.legend(loc='lower right')
    plt.show()

    # plot decision regions
    for classifier in classifiers:
        classifier.fit(X_train_std, y_train)
        plot_decision_regions(X_combined_std,
                              y_combined,
                              classifier=classifier,
                              test_idx=list(range(len(y_train), len(y))))
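
MajorityVoteClassifier is a custom soft-voting ensemble in the style of Raschka's book; presumably its get_params naming is what makes grid keys like 'logisticregression__C' resolve. scikit-learn's built-in VotingClassifier gives essentially the same behavior, with different grid keys:

from sklearn.ensemble import VotingClassifier

# roughly equivalent ensemble using scikit-learn directly; the grid keys
# would then read e.g. 'lr__C' and 'dt__max_depth' instead
majority_vote = VotingClassifier(estimators=[('lr', logistic_regression),
                                             ('dt', decision_tree),
                                             ('knn', knn)],
                                 voting='soft')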