def load_german(path="./datasets/german/german.data.txt"):
    """Load the German credit data set as a ``(data, target)`` namedtuple.

    The whitespace-separated, header-less file is read as strings, every
    distinct string in the whole table is mapped to an integer code, the
    categorical columns are one-hot encoded and the last column of the
    encoded table is used as the target.

    Args:
        path: Location of the data file. Defaults to the path the function
            has always read from, so existing no-argument calls still work.

    Returns:
        A namedtuple with fields ``data`` (2-D float array: the one-hot
        columns first, then the untouched columns in their original order,
        without the final column) and ``target`` (1-D float array holding
        -1.0 for rows whose label equals the first row's label, else 1.0).

    Raises:
        IndexError: If the table is empty or has fewer than 20 columns
            (columns 18 and 19 are addressed by position).
    """
    # header=None is the supported spelling of "no header row" (header=-1 is
    # rejected by current pandas); .values replaces the removed as_matrix().
    raw = pd.read_csv(path, header=None, sep=r'\s+').values
    raw = np.array(raw, dtype='str')

    # Integer-code every distinct string over the whole table at once
    # (sorted order), i.e. what LabelEncoder did on the ravelled matrix.
    _, flat_codes = np.unique(raw.ravel(), return_inverse=True)
    g = flat_codes.reshape(raw.shape)

    # Fixed categorical columns plus every column with more than two levels.
    n_cols = g.shape[1]
    cat_cols = {0, 2, 3, 5, 6, 8, 9, 11, 13, 14, 16, 18}
    cat_cols.update(j for j in range(n_cols) if np.unique(g[:, j]).size > 2)
    other_cols = [j for j in range(n_cols) if j not in cat_cols]

    # Foreign / not-foreign feature: smallest code -> -1, anything else -> +1.
    foreign = g[:, 19]
    g[:, 19] = np.where(foreign == foreign.min(), -1, 1)

    # One indicator column per observed level, categorical columns first and
    # the remaining columns appended unchanged. This is the layout produced
    # by the removed OneHotEncoder(categorical_features=..., sparse=False).
    blocks = []
    for j in sorted(cat_cols):
        column = g[:, j]
        levels = np.unique(column)
        blocks.append((column[:, None] == levels[None, :]).astype(float))
    blocks.append(g[:, other_cols].astype(float))
    g = np.hstack(blocks)

    # The class of the first row becomes -1, the other class +1.
    labels = g[:, -1]
    y = np.where(labels == labels[0], -1.0, 1.0)
    x = g[:, :-1]
    return namedtuple('_', 'data, target')(x, y)
def getAnova(self, X, y):
    """Plot cross-validated SVM accuracy against the ANOVA feature percentile.

    ``X`` is integer-coded (one code per distinct value over the whole
    array), reshaped to one row per entry of ``y`` and padded with 200
    random columns. An ANOVA ``SelectPercentile`` + ``SVC`` pipeline is then
    cross-validated (``cv=3``) for several percentiles and the mean score
    with its standard deviation is drawn as an error-bar plot.

    Args:
        X: Array of feature values; must support ``ravel`` and ``shape``.
        y: Target labels, one per sample.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Integer-code every distinct value, keeping the original array shape.
    codes = LabelEncoder().fit_transform(X.ravel())
    X = codes.reshape(*X.shape)

    n_samples = len(y)
    X = X.reshape((n_samples, -1))

    # Append 200 non-informative features.
    noise = 2 * np.random.random((n_samples, 200))
    X = np.hstack((X, noise))

    selector = feature_selection.SelectPercentile(feature_selection.f_classif)
    steps = [('anova', selector), ('svc', svm.SVC(C=1.0))]
    clf = Pipeline(steps)

    # Cross-validation score as a function of the percentile of features.
    percentiles = (5, 10, 20, 40, 60, 80, 100)
    score_means = []
    score_stds = []
    for pct in percentiles:
        clf.set_params(anova__percentile=pct)
        # Single-process cross-validation for this percentile.
        fold_scores = cross_val_score(clf, X, y, n_jobs=1, verbose=10, cv=3)
        score_means.append(fold_scores.mean())
        score_stds.append(fold_scores.std())

    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title(
        'Performance of the SVM-Anova varying the percentile of features selected'
    )
    plt.xlabel('Percentile')
    plt.ylabel('Prediction rate')
    plt.axis('tight')
    plt.show()
# NOTE(review): fragment of a larger script -- x, x1_min, x2_min, N, M, model,
# x_test, y_test, y_test_hat and iris_feature are defined outside this view.
# print calls use the parenthesized single-argument form, which behaves the
# same on Python 2 and Python 3 (the bare print statement is Python 2 only).
x1_max, x2_max = x.max()  # presumably x has exactly two feature columns -- TODO confirm
t1 = np.linspace(x1_min, x1_max, N)
t2 = np.linspace(x2_min, x2_max, M)
x1, x2 = np.meshgrid(t1, t2)  # grid of sample points
x_show = np.stack((x1.flat, x2.flat), axis=1)  # grid points stacked as two-column samples
print(x_show.shape)

cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
y_show_hat = model.predict(x_show)  # predictions on the grid
y_show_hat = y_show_hat.reshape(x1.shape)  # give them the same shape as the grid
plt.figure(facecolor='w')
plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light)  # predicted regions
plt.contour(x1, x2, y_show_hat, levels=(0, 1), colors='r', linestyles='-.')
plt.scatter(x_test[0], x_test[1], c=y_test.ravel(), edgecolors='k', s=150,
            zorder=10, cmap=cm_dark, marker='*')  # test data
plt.scatter(x[0], x[1], c=y.ravel(), edgecolors='k', s=40, cmap=cm_dark)  # all data
plt.xlabel(iris_feature[0], fontsize=15)
plt.ylabel(iris_feature[1], fontsize=15)
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
plt.grid(True)
plt.title(u'鸢尾花数据的决策树分类', fontsize=17)
plt.show()

# Compare the predictions with the true labels (test-set variables).
y_test = y_test.reshape(-1)
print(y_test_hat)
print(y_test)
result = (y_test_hat == y_test)  # True where the prediction is correct
acc = np.mean(result)
print(u'准确度: %.2f%%' % (100 * acc))
# NOTE(review): fragment of a larger script -- model, x_show, x1, x2, cm_light,
# cm_dark, x, y, x_test, y_test, y_test_hat, iris_feature and the axis limits
# are defined outside this view.
y_show_hat = model.predict(x_show)  # predictions on the grid
print(y_show_hat.shape)
print(y_show_hat)
y_show_hat = y_show_hat.reshape(x1.shape)  # give them the same shape as the grid
print(y_show_hat)
plt.figure(facecolor='w')
plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light)  # predicted regions
plt.scatter(x_test[0], x_test[1], c=y_test.ravel(), edgecolors='k', s=100,
            zorder=10, cmap=cm_dark, marker='*')  # test data
plt.scatter(x[0], x[1], c=y.ravel(), edgecolors='k', s=20, cmap=cm_dark)  # all data
plt.xlabel(iris_feature[0], fontsize=13)
plt.ylabel(iris_feature[1], fontsize=13)
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
# The on/off flag is passed positionally: its keyword was renamed from ``b``
# to ``visible`` in Matplotlib, so ``b=True`` fails on current releases.
plt.grid(True, ls=':', color='#606060')
plt.title('鸢尾花数据的决策树分类', fontsize=15)
plt.show()

# Compare the predictions with the true labels (test-set variables).
y_test = y_test.reshape(-1)
print(y_test_hat)
print(y_test)
result = (y_test_hat == y_test)  # True where the prediction is correct
acc = np.mean(result)
score_best = score print('p: {}, {}, {}'.format(p_lower, p, p_upper)) print('score: {}'.format(score_best)) print() return AffinityPropagation(preference=p_best).fit(y) if __name__ == '__main__': y_train = np.load('y_train.npy') c_train = np.load('c_train.npy').ravel() y_test = np.load('y_test.npy') c_test = np.load('c_test.npy').ravel() c_train = LabelEncoder().fit_transform(c_train) c_test = LabelEncoder().fit_transform(c_test) K = 40 # K = len(np.unique(c_train)) y = y_train[c_train.ravel() < K] c = c_train[c_train < K] # y = y_test[c_test.ravel() < K] # c = c_test[c_test < K] ap = ap_cluster_k(y, K, preference_init=-1.0, c=c, iter_finetune=30) c_pred = ap.predict(y) print(normalized_mutual_info_score(c, c_pred)) plt.plot(np.vstack((c_pred, c)).T) plt.show() # print f1_score(c, c_pred)
# NOTE(review): fragment of a larger script -- t1, x, x1_min, x1_max, x2_min,
# x2_max, M, model, x_test, y_test, y_test_hat and iris_feature are defined
# outside this view.
# print calls use the parenthesized single-argument form, which behaves the
# same on Python 2 and Python 3 (the bare print statement is Python 2 only).
t2 = np.linspace(x2_min, x2_max, M)
x1, x2 = np.meshgrid(t1, t2)  # grid of sample points
x_show = np.stack((x1.flat, x2.flat), axis=1)  # grid points stacked as two-column samples
print(x_show.shape)

cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
y_show_hat = model.predict(x_show)  # predictions on the grid
print(y_show_hat.shape)
print(y_show_hat)
y_show_hat = y_show_hat.reshape(x1.shape)  # give them the same shape as the grid
print(y_show_hat)
plt.figure(facecolor='w')
plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light)  # predicted regions
plt.scatter(x_test[0], x_test[1], c=y_test.ravel(), edgecolors='k', s=150,
            zorder=10, cmap=cm_dark, marker='*')  # test data
plt.scatter(x[0], x[1], c=y.ravel(), edgecolors='k', s=40, cmap=cm_dark)  # all data
plt.xlabel(iris_feature[0], fontsize=15)
plt.ylabel(iris_feature[1], fontsize=15)
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
plt.grid(True)
plt.title(u'鸢尾花数据的决策树分类', fontsize=17)
plt.show()

# Compare the predictions with the true labels (test-set variables).
y_test = y_test.reshape(-1)
print(y_test_hat)
print(y_test)
result = (y_test_hat == y_test)  # True where the prediction is correct
acc = np.mean(result)
print(u'准确度: %.2f%%' % (100 * acc))