Example #1
def plot_decision_threshold():
    from mglearn.datasets import make_blobs
    from sklearn.svm import SVC
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:  # fall back for scikit-learn < 0.18
        from sklearn.cross_validation import train_test_split
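    # NOTE: `plt`, `np`, `cm`, `plot_2d_scores`, and `plot_2d_separator` are not
    # imported in this snippet; they are module-level names in mglearn's plotting
    # code (matplotlib.pyplot, numpy, and helpers from mglearn.plot_helpers /
    # mglearn.tools) and are assumed to be in scope here.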

    X, y = make_blobs(n_samples=(400, 50), centers=2, cluster_std=[7.0, 2],
                      random_state=22)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    fig, axes = plt.subplots(2, 3, figsize=(15, 8))
    plt.suptitle("decision_threshold")
    axes[0, 0].set_title("training data")
    axes[0, 0].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm)

    svc = SVC(gamma=.05).fit(X_train, y_train)
    axes[0, 1].set_title("decision with threshold 0")
    axes[0, 1].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm)
    plot_2d_scores(svc, X_train, function="decision_function", alpha=.7,
                   ax=axes[0, 1])
    plot_2d_separator(svc, X_train, linewidth=3, ax=axes[0, 1])
    axes[0, 2].set_title("decision with threshold -0.8")
    axes[0, 2].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm)
    plot_2d_separator(svc, X_train, linewidth=3, ax=axes[0, 2], threshold=-.8)
    plot_2d_scores(svc, X_train, function="decision_function", alpha=.7,
                   ax=axes[0, 2])

    axes[1, 0].set_visible(False)

    # points whose feature-1 value lies within 5 units of 7, i.e. near the
    # cross-section plotted below
    mask = np.abs(X_train[:, 1] - 7) < 5
    n_in_band = np.sum(mask)

    # Take a horizontal cross-section of the feature space at feature 1 == 10:
    # sample feature 0 on a 100-point grid across the training range.
    line = np.linspace(X_train.min(), X_train.max(), 100)
    axes[1, 1].set_title("Cross-section with threshold 0")
    dec = svc.decision_function(np.c_[line, 10 * np.ones(100)])
    axes[1, 1].plot(line, dec, c='k')
    contour = (dec > 0).reshape(1, -1).repeat(10, axis=0)
    axes[1, 1].contourf(line, np.linspace(-1.5, 1.5, 10), contour, alpha=0.2, cmap=cm)
    axes[1, 1].scatter(X_train[mask, 0], np.zeros(n_in_band), c=y_train[mask], cmap=cm, alpha=.1, s=100)
    axes[1, 1].set_xlim(X_train.min(), X_train.max())
    axes[1, 1].set_ylim(-1.5, 1.5)
    axes[1, 1].set_xticks(())
    axes[1, 1].set_ylabel("Decision value")

    contour2 = (dec > -.8).reshape(1, -1).repeat(10, axis=0)
    axes[1, 2].set_title("Cross-section with threshold -0.8")
    axes[1, 2].contourf(line, np.linspace(-1.5, 1.5, 10), contour2, alpha=0.2, cmap=cm)
    axes[1, 2].scatter(X_train[mask, 0], np.zeros(n_in_band), c=y_train[mask], cmap=cm, alpha=.1, s=100)
    axes[1, 2].plot(line, dec, c='k')
    axes[1, 2].set_xlim(X_train.min(), X_train.max())
    axes[1, 2].set_ylim(-1.5, 1.5)
    axes[1, 2].set_xticks(())
    axes[1, 2].set_ylabel("Decision value")
Example #2
def plot_decision_threshold():
    from mglearn.datasets import make_blobs
    from sklearn.svm import SVC
    from sklearn.model_selection import train_test_split
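    # NOTE: `plt`, `np`, `discrete_scatter`, `plot_2d_scores`, `plot_2d_separator`,
    # `cm`, and the `ReBl` colormap are module-level names in mglearn's plotting
    # code (mglearn.plot_helpers / mglearn.tools) and are assumed to be in scope.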

    X, y = make_blobs(n_samples=(400, 50), centers=2, cluster_std=[7.0, 2],
                      random_state=22)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    fig, axes = plt.subplots(2, 3, figsize=(15, 8), subplot_kw={'xticks': (), 'yticks': ()})
    plt.suptitle("decision_threshold")
    axes[0, 0].set_title("training data")
    discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, ax=axes[0, 0])

    svc = SVC(gamma=.05).fit(X_train, y_train)
    axes[0, 1].set_title("decision with threshold 0")
    discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, ax=axes[0, 1])
    plot_2d_scores(svc, X_train, function="decision_function", alpha=.7,
                   ax=axes[0, 1], cm=ReBl)
    plot_2d_separator(svc, X_train, linewidth=3, ax=axes[0, 1])
    axes[0, 2].set_title("decision with threshold -0.8")
    discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, ax=axes[0, 2])
    plot_2d_separator(svc, X_train, linewidth=3, ax=axes[0, 2], threshold=-.8)
    plot_2d_scores(svc, X_train, function="decision_function", alpha=.7,
                   ax=axes[0, 2], cm=ReBl)

    axes[1, 0].set_axis_off()

    # points whose feature-1 value lies within 5 units of 7, i.e. near the
    # cross-section plotted below
    mask = np.abs(X_train[:, 1] - 7) < 5
    n_in_band = np.sum(mask)

    # Take a horizontal cross-section of the feature space at feature 1 == 10:
    # sample feature 0 on a 100-point grid across the training range.
    line = np.linspace(X_train.min(), X_train.max(), 100)
    axes[1, 1].set_title("Cross-section with threshold 0")
    dec = svc.decision_function(np.c_[line, 10 * np.ones(100)])
    axes[1, 1].plot(line, dec, c='k')
    contour = (dec > 0).reshape(1, -1).repeat(10, axis=0)
    axes[1, 1].contourf(line, np.linspace(-1.5, 1.5, 10), contour, alpha=0.4, cmap=cm)
    discrete_scatter(X_train[mask, 0], np.zeros(n_in_band), y_train[mask], ax=axes[1, 1])
    axes[1, 1].set_xlim(X_train.min(), X_train.max())
    axes[1, 1].set_ylim(-1.5, 1.5)
    axes[1, 1].set_xticks(())
    axes[1, 1].set_ylabel("Decision value")

    contour2 = (dec > -.8).reshape(1, -1).repeat(10, axis=0)
    axes[1, 2].set_title("Cross-section with threshold -0.8")
    axes[1, 2].contourf(line, np.linspace(-1.5, 1.5, 10), contour2, alpha=0.4, cmap=cm)
    discrete_scatter(X_train[mask, 0], np.zeros(n_in_band), y_train[mask], alpha=.1, ax=axes[1, 2])
    axes[1, 2].plot(line, dec, c='k')
    axes[1, 2].set_xlim(X_train.min(), X_train.max())
    axes[1, 2].set_ylim(-1.5, 1.5)
    axes[1, 2].set_xticks(())
    axes[1, 2].set_ylabel("Decision value")
    axes[1, 0].legend(['negative class', 'positive class'])
Example #4
def imbalanced_two_classes():
    from mglearn.datasets import make_blobs
    # X, y = make_blobs(n_samples=(350, 50), centers=[2], cluster_std=[7.0, 2], random_state=seed)
    X, y = make_blobs(n_samples=(350, 50), cluster_std=[7.0, 2], random_state=seed)

    show_title("不平衡数据的二分类问题")
    print("数据中87.5%是一类,12.5%的数据是另一类")

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
    show_subtitle("测试集的标签")
    print(y_test)

    from sklearn.svm import SVC
    from sklearn.metrics import classification_report
    svc = SVC(gamma=0.05)
    svc.fit(X_train, y_train)
    predict_svc = svc.predict(X_test)
    show_subtitle("SVC分类报告")
    print(classification_report(y_test, predict_svc))

    from sklearn.metrics import confusion_matrix
    confusion = confusion_matrix(y_test, predict_svc)
    show_subtitle("SVC分类的混淆矩阵")
    print(confusion)

    # Thresholding the decision function is one way to compensate for class
    # imbalance (and for unequal class weights in the sample), but hand-picking
    # a threshold like this is fragile.
    predict_svc_lower_threshold = svc.decision_function(X_test) > -0.35
    show_subtitle("Classification report for the threshold-adjusted SVC predictions:")
    print(classification_report(y_test, predict_svc_lower_threshold))
    # The threshold can only be chosen empirically; there is no
    # general-purpose algorithm for picking it.
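    # (A less manual alternative, not used in this project: SVC's class_weight
    # parameter, e.g. class_weight="balanced", reweights classes during training.)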

    confusion = confusion_matrix(y_test, predict_svc_lower_threshold)
    show_subtitle("SVC分类的混淆矩阵")
    print(confusion)

    mglearn.plots.plot_decision_threshold()
    plt.suptitle("图5-12:决策函数的热图与改变决策阈值的影响")
    pass
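
A less ad hoc way to pick the threshold, sketched under the assumption that the `svc`, `X_test`, and `y_test` from the example above are in scope (in practice a separate validation split, not the test set, should be used):

def pick_threshold_for_recall(model, X_val, y_val, target_recall=0.90):
    # Sweep the precision-recall curve and return the largest decision
    # threshold whose recall still meets the target.
    from sklearn.metrics import precision_recall_curve
    precision, recall, thresholds = precision_recall_curve(
        y_val, model.decision_function(X_val))
    # recall[:-1] aligns with thresholds and is non-increasing as the
    # threshold grows, so the last qualifying entry is the largest threshold.
    ok = recall[:-1] >= target_recall
    return thresholds[ok][-1] if ok.any() else thresholds[0]

# e.g. threshold = pick_threshold_for_recall(svc, X_test, y_test)
#      y_pred = svc.decision_function(X_test) >= threshold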
Example #6
from mglearn.datasets import make_blobs
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X, y = make_blobs(n_samples=(400, 50),
                  centers=2,
                  cluster_std=[7.0, 2],
                  random_state=22)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

svc = SVC(gamma=0.05)
svc.fit(X_train, y_train)
print('Default:')
print(classification_report(y_test, svc.predict(X_test)))

# Predict positive whenever the decision function exceeds -0.8 rather than the
# default threshold of 0, trading precision for recall on the minority class.
y_pred_lower_threshold = svc.decision_function(X_test) > -0.8
print('Lower Threshold:')
print(classification_report(y_test, y_pred_lower_threshold))
Example #7
def compare_roc_curve():
    from sklearn.datasets import make_blobs
    X, y = make_blobs(n_samples=(4000, 500), n_features=2, cluster_std=[7.0, 2], random_state=22)

    show_title("使用 ROC 曲线分析不平衡的数据对模型的影响")
    print("数据中87.5%是一类,12.5%的数据是另一类")

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
    show_subtitle("测试集的标签")
    print(y_test)

    from sklearn.svm import SVC
    svc = SVC(gamma=0.05)
    svc.fit(X_train, y_train)

    from sklearn.metrics import roc_curve
    fpr, tpr, thresholds = roc_curve(y_test, svc.decision_function(X_test))
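    # roc_curve returns one (fpr, tpr) point per candidate threshold, with the
    # thresholds sorted in decreasing order.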

    # find the position of the threshold closest to 0
    close_zero = np.argmin(np.abs(thresholds))

    plt.plot(fpr[close_zero], tpr[close_zero], 'o',
             markersize=10, label="threshold 0 (SVC)",
             fillstyle='none', c='k', mew=2)
    plt.plot(fpr, tpr, label="ROC Curve")
    plt.xlabel('FPR')
    plt.ylabel('TPR(recall)')
    plt.legend()
    plt.title("图5-15:SVC(gamma=0.05)的ROC曲线\n"
              "曲线越靠近左上角,则分类器越好")

    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier(n_estimators=100, random_state=0, max_features=2)
    rf.fit(X_train, y_train)

    # RandomForestClassifier has predict_proba, but no decision_function
    fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, rf.predict_proba(X_test)[:, 1])
    close_default_rf = np.argmin(np.abs(thresholds_rf - 0.5))
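    # predict_proba's default decision rule for the positive class is p > 0.5,
    # so mark that operating point on the curve.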

    plt.figure()
    plt.plot(fpr[close_zero], tpr[close_zero], 'o',
             markersize=10, label="threshold 0 (SVC)",
             fillstyle='none', c='k', mew=2)
    plt.plot(fpr, tpr, label="ROC Curve SVC")

    plt.plot(fpr_rf[close_default_rf], tpr_rf[close_default_rf], '^',
             markersize=10, label="threshold 0.5 (RF)",
             fillstyle='none', c='k', mew=2)
    plt.plot(fpr_rf, tpr_rf, label="ROC Curve RF")

    plt.xlabel('FPR (false positive rate)')
    plt.ylabel('TPR (true positive rate, i.e. recall)')
    plt.legend()
    plt.title("图5-16:比较 SVM 和 随机森林 的 ROC曲线\n"
              "曲线越靠近左上角,则分类器越好\n"
              "即假真类率(FPR)要低,真真类率(TPR)要高")

    # For classification on imbalanced datasets, AUC is a more informative metric
    # than accuracy: it is the probability that a randomly drawn positive sample
    # receives a higher score than a randomly drawn negative sample.
    show_subtitle("AUC is the integral (area) under the ROC curve, interpretable "
                  "as the quality of the ranking of positive samples")
    from sklearn.metrics import roc_auc_score
    svc_auc = roc_auc_score(y_test, svc.decision_function(X_test))
    print("SVC的AUC:{:.3f}".format(svc_auc))
    rf_auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
    print("随机森林的AUC:{:.3f}".format(rf_auc))
    print("对于不平衡类别的分类问题,选择模型时使用 AUC 比 精度 更有意义")
    pass
Example #8
def compare_precision_recall_curve():
    from sklearn.datasets import make_blobs
    X, y = make_blobs(n_samples=(4000, 500), n_features=2, cluster_std=[7.0, 2], random_state=22)

    show_title("使用“准确率——召回率曲线”分析不平衡的数据对模型的影响")
    print("数据中87.5%是一类,12.5%的数据是另一类")

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    show_subtitle("测试集的标签")
    print(y_test)

    from sklearn.svm import SVC
    svc = SVC(gamma=0.05)
    svc.fit(X_train, y_train)

    from sklearn.metrics import precision_recall_curve
    precision_svc, recall_svc, thresholds_svc = precision_recall_curve(y_test, svc.decision_function(X_test))
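    # precision_recall_curve returns precision/recall arrays one element longer
    # than thresholds; the final point is pinned at precision=1, recall=0.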

    # find the position of the threshold closest to 0
    close_zero = np.argmin(np.abs(thresholds_svc))
    plt.plot(precision_svc[close_zero], recall_svc[close_zero], 'o',
             markersize=10, label="threshold 0 (SVC)",
             fillstyle='none', c='k', mew=2)
    plt.plot(precision_svc, recall_svc, label="precision-recall curve")
    plt.xlabel('Precision')
    plt.ylabel('Recall')
    plt.legend()
    plt.suptitle("Figure 5-13: Precision-recall curve for SVC(gamma=0.05)\n"
                 "The closer the curve is to the top-right corner, the better the classifier")

    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier(n_estimators=100, random_state=0, max_features=2)
    rf.fit(X_train, y_train)

    # RandomForestClassifier has predict_proba, but no decision_function
    precision_rf, recall_rf, thresholds_rf = precision_recall_curve(y_test, rf.predict_proba(X_test)[:, 1])

    plt.figure()
    plt.plot(precision_svc, recall_svc, label="SVC")
    plt.plot(precision_svc[close_zero], recall_svc[close_zero], 'o',
             markersize=10, label="threshold 0 (SVC)",
             fillstyle='none', c='k', mew=2)

    close_default_rf = np.argmin(np.abs(thresholds_rf - 0.5))
    plt.plot(precision_rf, recall_rf, label="random forest")
    plt.plot(precision_rf[close_default_rf], recall_rf[close_default_rf], '^',
             markersize=10, label="threshold 0.5 (RF)",
             fillstyle='none', mew=2)
    plt.xlabel('Precision')
    plt.ylabel('Recall')
    plt.legend()
    plt.title("Figure 5-14: Comparing the precision-recall curves of the SVM and the random forest\n"
              "The SVM does better in the middle of the curve;\n"
              "the random forest does better at the extremes (very high precision or very high recall)")

    show_subtitle("f1_score表示了准确率——召回率曲线上默认阈值对应的点")
    from sklearn.metrics import f1_score
    predict_svc = svc.predict(X_test)
    print("SVC的f1_score: {:.3f}".format(f1_score(y_test, predict_svc)))
    predict_rf = rf.predict(X_test)
    print("随机森林的f1_score: {:.3f}".format(f1_score(y_test, predict_rf)))

    show_subtitle("平均准确率(Average Precision)表示曲线下的积分(即面积)")
    from sklearn.metrics import average_precision_score
    ap_svc = average_precision_score(y_test, svc.decision_function(X_test))
    print("SVC的平均准确率:{:.3f}".format(ap_svc))
    ap_rf = average_precision_score(y_test, rf.predict_proba(X_test)[:, 1])
    print("随机森林的平均准确率:{:.3f}".format(ap_rf))

    pass
Example #9
import matplotlib.pyplot as plt
import mglearn
import numpy as np
import pandas as pd

from mglearn.datasets import make_blobs
from sklearn.svm import LinearSVC

X, y = make_blobs(centers=4, random_state=8)
y = y % 2

linear_svm = LinearSVC().fit(X, y)
'''
mglearn.plots.plot_2d_separator(linear_svm,X)
mglearn.discrete_scatter(X[:,0],X[:,1],y)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
'''
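# Append the square of feature 1 to X as a third feature; in this 3-D
# representation the two classes become separable by a plane.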
X_new = np.hstack([X, X[:, 1:]**2])

from mpl_toolkits.mplot3d import Axes3D, axes3d
figure = plt.figure()

ax = Axes3D(figure, elev=-152, azim=-26)
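# NOTE: direct Axes3D construction is deprecated on recent matplotlib versions;
# the current equivalent (an assumption about your matplotlib version) is
#   ax = figure.add_subplot(projection='3d')
#   ax.view_init(elev=-152, azim=-26)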
mask = y == 0
# c='b' fixes the colour, so no colormap is needed for this scatter
ax.scatter(X_new[mask, 0], X_new[mask, 1], X_new[mask, 2], c='b', s=60)
Example #10
import numpy as np
import mglearn
import matplotlib.pyplot as plt
from mglearn.datasets import make_blobs
from sklearn.svm import LinearSVC

X, y = make_blobs(random_state=42, centers=3)

linear_svc = LinearSVC().fit(X, y)
print('Coefficient Shape', linear_svc.coef_.shape)
print('Intercept Shape', linear_svc.intercept_.shape)


mglearn.plots.plot_2d_classification(linear_svc, X, fill=True, alpha=.6)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
line = np.linspace(-15, 15)

# Each one-vs-rest boundary is the line where coef[0] * x0 + coef[1] * x1 + intercept == 0,
# i.e. x1 = -(coef[0] * x0 + intercept) / coef[1].
for coef, intercept, color in zip(linear_svc.coef_, linear_svc.intercept_, ['b', 'r', 'g']):
    plt.plot(line, -(line * coef[0] + intercept) / coef[1], c=color)

plt.ylim(-10, 15)
plt.xlim(-10, 8)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
plt.legend(['Class 0', 'Class 1', 'Class 2', 'Line Class 0', 'Line Class 1', 'Line Class 2'], loc=(1.01, 0.3))
plt.show()
Example #11
import matplotlib.pyplot as plt
import mglearn
import numpy as np
import pandas as pd

#mglearn.plots.plot_agglomerative()

from mglearn.datasets import make_blobs

from scipy.cluster.hierarchy import dendrogram, ward

X, y = make_blobs(random_state=0, n_samples=12)
linkage_array = ward(X)
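# ward() returns a linkage array in which each row records one agglomerative
# merge: the two clusters joined, the distance bridged, and the new cluster
# size; dendrogram() visualises these merges.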
dendrogram(linkage_array)

ax = plt.gca()
bounds = ax.get_xbound()
ax.plot(bounds, [7.25, 7.25], '--', c='k')
ax.plot(bounds, [4, 4], '--', c='k')

ax.text(bounds[1], 7.25, 'two clusters', va='center', fontdict={'size': 15})
ax.text(bounds[1], 4, 'three clusters', va='center', fontdict={'size': 15})
plt.xlabel("Sample index")
plt.ylabel("Cluster distance")
Example #12
"""
Chapter 2
Supervised Learning - Kernelized Support Vector Machines
"""

import matplotlib.pyplot as plt
import mglearn.datasets
from mpl_toolkits import mplot3d
import numpy as np
from sklearn import datasets
from sklearn import model_selection
from sklearn import svm

"""
Linear models and nonlinear features
"""
x, y = datasets.make_blobs(centers=4, random_state=8)
y = y % 2

mglearn.discrete_scatter(x[:, 0], x[:, 1], y)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')

linear_svm = svm.LinearSVC().fit(x, y)

# decision boundary found by a linear SVM
mglearn.plots.plot_2d_separator(linear_svm, x)
mglearn.discrete_scatter(x[:, 0], x[:, 1], y)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')

# add a third feature derived from feature 1
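# (the continuation announced by the comment above mirrors Example #9)
x_new = np.hstack([x, x[:, 1:] ** 2])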