Exemplo n.º 1
0
def load_german():
    """Load the German credit data file and return it as ``(data, target)``.

    The whitespace-separated file ``./datasets/german/german.data.txt`` is
    read without a header row. Every cell is converted to a string and
    label-encoded with a single encoder fitted over the whole table, so equal
    strings map to the same integer regardless of column. Column 19 is then
    remapped to -1 / +1, the selected categorical columns are one-hot
    encoded, and the last column of the encoded matrix is used as the label.

    Returns:
        A namedtuple with fields ``data`` (every column of the encoded matrix
        except the last) and ``target`` (a float array holding -1.0 for rows
        whose label equals the label of the first row and +1.0 otherwise).
    """
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder

    # header=None is the supported spelling for "no header row"; negative
    # header values are rejected by current pandas. The separator is a regex,
    # hence the raw string.
    frame = pd.read_csv("./datasets/german/german.data.txt",
                        header=None, sep=r'\s+')
    # .values replaces the removed DataFrame.as_matrix().
    g = np.array(frame.values, dtype='str')
    g = LabelEncoder().fit_transform(g.ravel()).reshape(g.shape)

    # Columns always treated as categorical, plus every column that holds
    # more than two distinct encoded values.
    categorical = {0, 2, 3, 5, 6, 8, 9, 11, 13, 14, 16, 18}
    categorical.update(
        col for col in range(g.shape[1]) if np.unique(g[:, col]).size > 2
    )

    # Foreign / not-foreign feature: smallest encoded value -> -1, rest -> +1.
    foreign = g[:, 19]
    g[:, 19] = np.where(foreign == foreign.min(), -1, 1)

    list_of_cat = sorted(categorical)
    # NOTE(review): n_values / categorical_features belong to the legacy
    # OneHotEncoder interface -- confirm the pinned scikit-learn version
    # still accepts them before upgrading.
    enc = OneHotEncoder(n_values='auto',
                        categorical_features=list_of_cat,
                        sparse=False,
                        handle_unknown='error')
    enc.fit(g)
    g = enc.transform(g)

    # The label of the first row defines the -1.0 class; every other label
    # value becomes +1.0.
    ytrue_value = g[0, -1]
    y = np.where(g[:, -1] == ytrue_value, -1.0, 1.0)
    x = g[:, :-1]
    return namedtuple('_', 'data, target')(x, y)
Exemplo n.º 2
0
    def getAnova(self, X, y):
        """Plot cross-validated SVM accuracy against the ANOVA percentile.

        ``X`` is label-encoded as one flat sequence and restored to its
        original shape, flattened to one row per entry of ``y``, and extended
        with 200 random columns drawn from ``2 * np.random.random``. An
        ``anova -> svc`` pipeline is then scored with 3-fold cross-validation
        for each percentile in (5, 10, 20, 40, 60, 80, 100), and the mean
        score with its standard deviation is shown as an error-bar plot.

        Args:
            X: array of feature values; must expose ``ravel``, ``shape`` and
                ``reshape``.
            y: labels, one per sample.

        Returns:
            None. The figure is displayed with ``plt.show()``.
        """
        original_shape = X.shape
        X = LabelEncoder().fit_transform(X.ravel()).reshape(*original_shape)

        n_samples = len(y)
        X = X.reshape((n_samples, -1))

        # Append 200 non-informative features.
        noise = 2 * np.random.random((n_samples, 200))
        X = np.hstack((X, noise))

        selector = feature_selection.SelectPercentile(
            feature_selection.f_classif)
        clf = Pipeline([('anova', selector), ('svc', svm.SVC(C=1.0))])

        # Cross-validation score as a function of the percentile of features.
        percentiles = (5, 10, 20, 40, 60, 80, 100)
        score_means = []
        score_stds = []
        for pct in percentiles:
            clf.set_params(anova__percentile=pct)
            # Single-process cross-validation.
            fold_scores = cross_val_score(
                clf, X, y, n_jobs=1, verbose=10, cv=3)
            score_means.append(fold_scores.mean())
            score_stds.append(fold_scores.std())

        plt.errorbar(percentiles, score_means, np.array(score_stds))
        plt.title(
            'Performance of the SVM-Anova varying the percentile of features selected'
        )
        plt.xlabel('Percentile')
        plt.ylabel('Prediction rate')
        plt.axis('tight')
        plt.show()
Exemplo n.º 3
0
    x1_max, x2_max = x.max()
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)  # 生成网格采样点
    x_show = np.stack((x1.flat, x2.flat), axis=1)  # 测试点
    print x_show.shape

    cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    y_show_hat = model.predict(x_show)  # 预测值
    y_show_hat = y_show_hat.reshape(x1.shape)  # 使之与输入的形状相同
    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light)  # 预测值的显示
    plt.contour(x1, x2, y_show_hat, levels=(0,1), colors='r', linestyles='-.')
    plt.scatter(x_test[0], x_test[1], c=y_test.ravel(), edgecolors='k', s=150, zorder=10, cmap=cm_dark, marker='*')  # 测试数据
    plt.scatter(x[0], x[1], c=y.ravel(), edgecolors='k', s=40, cmap=cm_dark)  # 全部数据
    plt.xlabel(iris_feature[0], fontsize=15)
    plt.ylabel(iris_feature[1], fontsize=15)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid(True)
    plt.title(u'鸢尾花数据的决策树分类', fontsize=17)
    plt.show()

    # 训练集上的预测结果
    y_test = y_test.reshape(-1)
    print y_test_hat
    print y_test
    result = (y_test_hat == y_test)   # True则预测正确,False则预测错误
    acc = np.mean(result)
    print u'准确度: %.2f%%' % (100 * acc)
Exemplo n.º 4
0
    y_show_hat = model.predict(x_show)  # 预测值
    print(y_show_hat.shape)
    print(y_show_hat)
    y_show_hat = y_show_hat.reshape(x1.shape)  # 使之与输入的形状相同
    print(y_show_hat)
    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light)  # 预测值的显示
    plt.scatter(x_test[0],
                x_test[1],
                c=y_test.ravel(),
                edgecolors='k',
                s=100,
                zorder=10,
                cmap=cm_dark,
                marker='*')  # 测试数据
    plt.scatter(x[0], x[1], c=y.ravel(), edgecolors='k', s=20,
                cmap=cm_dark)  # 全部数据
    plt.xlabel(iris_feature[0], fontsize=13)
    plt.ylabel(iris_feature[1], fontsize=13)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid(b=True, ls=':', color='#606060')
    plt.title('鸢尾花数据的决策树分类', fontsize=15)
    plt.show()

    # 训练集上的预测结果
    y_test = y_test.reshape(-1)
    print(y_test_hat)
    print(y_test)
    result = (y_test_hat == y_test)  # True则预测正确,False则预测错误
    acc = np.mean(result)
Exemplo n.º 5
0
                score_best = score
        print('p: {}, {}, {}'.format(p_lower, p, p_upper))
        print('score: {}'.format(score_best))
        print()
    return AffinityPropagation(preference=p_best).fit(y)


if __name__ == '__main__':
    # Script entry point: load the saved arrays, cluster the samples whose
    # encoded class id is below K, and compare predicted clusters with the
    # encoded labels.
    y_train = np.load('y_train.npy')
    c_train = np.load('c_train.npy').ravel()
    y_test = np.load('y_test.npy')
    c_test = np.load('c_test.npy').ravel()

    # Each label vector is mapped to integer ids by its own encoder.
    c_train = LabelEncoder().fit_transform(c_train)
    c_test = LabelEncoder().fit_transform(c_test)

    # Keep only the training samples whose encoded label is below K.
    K = 40
    y = y_train[c_train.ravel() < K]
    c = c_train[c_train < K]

    # NOTE(review): ap_cluster_k is defined outside this section; it is
    # presumably the function ending just above, which returns a fitted
    # AffinityPropagation model -- confirm.
    ap = ap_cluster_k(y, K, preference_init=-1.0, c=c, iter_finetune=30)
    c_pred = ap.predict(y)

    nmi = normalized_mutual_info_score(c, c_pred)
    print(nmi)

    # One line for the predicted cluster ids, one for the encoded labels.
    stacked = np.vstack((c_pred, c))
    plt.plot(stacked.T)
    plt.show()
#    print f1_score(c, c_pred)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)  # 生成网格采样点
    x_show = np.stack((x1.flat, x2.flat), axis=1)  # 测试点
    print x_show.shape

    cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    y_show_hat = model.predict(x_show)  # 预测值
    print y_show_hat.shape
    print y_show_hat
    y_show_hat = y_show_hat.reshape(x1.shape)  # 使之与输入的形状相同
    print y_show_hat
    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light)  # 预测值的显示
    plt.scatter(x_test[0], x_test[1], c=y_test.ravel(), edgecolors='k', s=150, zorder=10, cmap=cm_dark, marker='*')  # 测试数据
    plt.scatter(x[0], x[1], c=y.ravel(), edgecolors='k', s=40, cmap=cm_dark)  # 全部数据
    plt.xlabel(iris_feature[0], fontsize=15)
    plt.ylabel(iris_feature[1], fontsize=15)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid(True)
    plt.title(u'鸢尾花数据的决策树分类', fontsize=17)
    plt.show()

    # 训练集上的预测结果
    y_test = y_test.reshape(-1)
    print y_test_hat
    print y_test
    result = (y_test_hat == y_test)   # True则预测正确,False则预测错误
    acc = np.mean(result)
    print u'准确度: %.2f%%' % (100 * acc)