def kmeans_classification_builder(centroid_func, x_train, x_test, y_train, y_test):
    """Train a 10-cluster KMeansClassifier (seeded by centroid_func) and report test accuracy.

    Side effects: writes a preview grid of training digits to plots/digits.png
    and prints the classification accuracy on the test split.
    """
    # Render the first few training digits (8x8 images) into a tiled canvas,
    # one 10-pixel cell per digit.
    n_preview = 25
    grid = int(np.ceil(np.sqrt(n_preview)))
    canvas = np.zeros((10 * grid, 10 * grid))
    for row in range(grid):
        for col in range(grid):
            idx = row * grid + col
            if idx < n_preview:
                canvas[10 * row:10 * row + 8,
                       10 * col:10 * col + 8] = x_train[idx].reshape([8, 8])
    plt.imsave('plots/digits.png', canvas, cmap='Greys')

    # Fit the k-means classifier with the supplied centroid-initialization routine.
    n_cluster = 10
    classifier = KMeansClassifier(n_cluster=n_cluster, max_iter=100, e=1e-6)
    classifier.fit(x_train, y_train, centroid_func)
    y_hat_test = classifier.predict(x_test)

    assert y_hat_test.shape == y_test.shape, \
        'y_hat_test and y_test should have same shape'
    print(
        '[*] Prediction accuracy of K-means classifier with {} cluster is {}'.
        format(n_cluster, np.mean(y_hat_test == y_test)))
def chooseK(data_X, data_Y, max_k=100):
    """Choose the number of clusters by maximizing the Rand index.

    Fits a KMeansClassifier for each k in [1, max_k) and scores its labels
    against the ground-truth labels data_Y with the Rand index.

    Parameters:
        data_X: feature matrix passed to KMeansClassifier.fit.
        data_Y: ground-truth labels, one per row of data_X (hashable scalars).
        max_k:  exclusive upper bound on k (default 100, matching the
                original hard-coded range).

    Returns:
        (best_rd, best_k): the best Rand index found and the k achieving it.
    """
    best_rd = 0
    best_k = 0
    for k in range(1, max_k):
        clf = KMeansClassifier(k)
        clf.fit(data_X)
        rd = _rand_index(data_Y, clf._labels)
        print("rd ", rd, "k ", k)
        if rd > best_rd:
            best_rd = rd
            best_k = k
    return best_rd, best_k


def _rand_index(truth, labels):
    """Rand index between two labelings of the same samples.

    Computed in O(n) from contingency counts instead of the O(n^2) pair
    loop: with a = pairs agreeing in both labelings and d = pairs
    disagreeing in both, returns (a + d) / C(n, 2) — identical to the
    original 2*(a+d)/(n*(n-1)). Raises ZeroDivisionError for n < 2,
    like the original.
    """
    n = len(truth)
    pairs = n * (n - 1) // 2
    joint = {}
    per_truth = {}
    per_label = {}
    for t, l in zip(truth, labels):
        joint[(t, l)] = joint.get((t, l), 0) + 1
        per_truth[t] = per_truth.get(t, 0) + 1
        per_label[l] = per_label.get(l, 0) + 1

    def _c2(m):
        # number of unordered pairs within a group of size m
        return m * (m - 1) // 2

    a = sum(_c2(m) for m in joint.values())               # same in both
    b = sum(_c2(m) for m in per_truth.values()) - a       # same truth only
    c = sum(_c2(m) for m in per_label.values()) - a       # same label only
    d = pairs - a - b - c                                 # different in both
    return (a + d) / pairs
def getkmeansresult(k, data_X, data_od):
    """Cluster data_X into k clusters and check per-cluster quality limits.

    For each cluster, walks its points and accumulates the data_od weight;
    the result is rejected (flag=False) as soon as a point lies more than
    3000 from its centroid, or when a cluster's accumulated weight exceeds
    4000.

    Returns:
        (flag, clf): flag is True iff every cluster passes both limits;
        clf is the fitted KMeansClassifier.
    """
    clf = KMeansClassifier(k)
    clf.fit(data_X, data_od)
    cents = clf._centroids
    labels = clf._labels
    flag = True
    for i in range(k):
        idx = np.nonzero(labels == i)[0]
        xs = data_X[idx, 0]
        ys = data_X[idx, 1]
        weights = data_od[idx, 0]
        total = 0
        for j in range(len(xs)):
            # np.math was removed in NumPy 1.25; np.hypot computes the
            # same Euclidean distance (and is overflow-safe).
            dist = np.hypot(cents[i, 0] - xs[j], cents[i, 1] - ys[j])
            total += weights[j]
            if dist > 3000:
                flag = False
                break
        if total > 4000:
            flag = False
        if flag == False:
            break
    return flag, clf
def kmeans_classification():
    """Compare a k-means classifier against logistic regression and KNN on digits.

    Side effects: saves a digit preview to plots/digits.png, prints each
    model's test accuracy, and saves the k-means results to
    results/k_means_classification.npz.
    """
    x_train, x_test, y_train, y_test = load_digits()

    # plot some train data: tile the first N 8x8 digits into one image
    N = 25
    l = int(np.ceil(np.sqrt(N)))
    im = np.zeros((10 * l, 10 * l))
    for m in range(l):
        for n in range(l):
            if (m * l + n < N):
                im[10 * m:10 * m + 8,
                   10 * n:10 * n + 8] = x_train[m * l + n].reshape([8, 8])
    plt.imsave('plots/digits.png', im, cmap='Greys')

    n_cluster = 30
    classifier = KMeansClassifier(n_cluster=n_cluster, max_iter=100, e=1e-6)
    classifier.fit(x_train, y_train)
    # Keep the k-means predictions in their own variable so the later
    # classifiers cannot clobber them before np.savez.
    y_hat_kmeans = classifier.predict(x_test)

    assert y_hat_kmeans.shape == y_test.shape, \
        'y_hat_test and y_test should have same shape'
    print('Prediction accuracy of K-means classifier with {} cluster is {}'.
          format(n_cluster, np.mean(y_hat_kmeans == y_test)))

    linear_classifier = LogisticRegression()
    linear_classifier.fit(x_train, y_train)
    y_hat_test = linear_classifier.predict(x_test)
    print('Accuracy of logistic regression classifier is {}'.format(
        np.mean(y_hat_test == y_test)))

    KNNClassifier = KNeighborsClassifier()
    KNNClassifier.fit(x_train, y_train)
    y_hat_test = KNNClassifier.predict(x_test)
    print('Accuracy of Nearest Neighbour classifier is {}'.format(
        np.mean(y_hat_test == y_test)))

    # BUG FIX: the original saved y_hat_test AFTER it had been overwritten
    # with the KNN predictions, so the "k_means" results file actually held
    # KNN output next to the k-means centroids. Save the k-means predictions.
    np.savez('results/k_means_classification.npz',
             y_hat_test=y_hat_kmeans,
             y_test=y_test,
             centroids=classifier.centroids,
             centroid_labels=classifier.centroid_labels)
def fit(self, X):
    """Bisecting k-means: grow from one cluster to self._k by repeated 2-way splits.

    At each step, tentatively splits every existing cluster with a 2-means
    fit and commits the split that minimizes total SSE (SSE of the split
    cluster's two halves + SSE of all untouched clusters).

    Sets: self._clusterAssment (m x 2: cluster index, squared error),
    self._labels, self._sse, self._centroids.
    """
    m = X.shape[0]
    self._clusterAssment = np.zeros((m, 2))
    # Start with a single cluster at the global mean.
    centroid0 = np.mean(X, axis=0).tolist()
    cenList = [centroid0]
    for j in range(m):
        # Initial squared error of each sample w.r.t. the single centroid.
        self._clusterAssment[j, 1] = self._calEDist(np.asarray(centroid0),
                                                    X[j, :])**2
    while (len(cenList) < self._k):
        lowestSSE = np.inf
        for i in range(len(cenList)):
            index_all = self._clusterAssment[:, 0]
            value = np.nonzero(index_all == i)
            # Samples currently assigned to cluster i.
            ptsInCurrCluster = X[value[0], :]
            clf = KMeansClassifier(k=2)
            clf.fit(ptsInCurrCluster)
            centroidMat, splitClustAss = clf._centroids, clf._clusterAssment
            sseSplit = sum(splitClustAss[:, 1])
            # BUG FIX: the SSE of the clusters NOT being split must use
            # "!= i"; the original summed the split cluster's own SSE
            # ("== i"), inverting the selection criterion.
            notSplit = np.nonzero(self._clusterAssment[:, 0] != i)[0]
            sseNotSplit = sum(self._clusterAssment[notSplit, 1])
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # After the split, one sub-cluster keeps the original index and the
        # other gets the next free index len(cenList).
        bestClustAss[np.nonzero(bestClustAss[:, 0] == 1)[0], 0] = len(cenList)
        bestClustAss[np.nonzero(bestClustAss[:, 0] == 0)[0],
                     0] = bestCentToSplit
        cenList[bestCentToSplit] = bestNewCents[0, :].tolist()
        # BUG FIX: list.append returns None, so the original
        # "cenList.append(...).tolist()" raised AttributeError; convert the
        # centroid to a list BEFORE appending.
        cenList.append(bestNewCents[1, :].tolist())
        self._clusterAssment[np.nonzero(
            self._clusterAssment[:, 0] == bestCentToSplit)[0], :] = bestClustAss
    self._labels = self._clusterAssment[:, 0]
    self._sse = sum(self._clusterAssment[:, 1])
    self._centroids = np.asarray(cenList)
def get_cluster(starttime, stoptime, k):
    """Run k-means on the stations of a time period and plot the clusters.

    Fetches the period's data with get_pandas_df, clusters the numeric
    features into k groups, scatter-plots points colored by cluster with
    centroids in red, and shows the figure.

    Returns:
        (df_origin, centroids): the original DataFrame with an added
        integer 'label_kmeans' column, and the centroid array.
    """
    df = get_pandas_df(starttime, stoptime)
    df_origin = df[1]
    df = df[0]
    # np.float was removed in NumPy 1.24; the builtin float is the
    # documented replacement (same dtype: float64).
    data_X = np.array(df).astype(float)
    clf = KMeansClassifier(k)
    clf.fit(data_X)
    centroids = clf._centroids
    labels = clf._labels
    df_origin["label_kmeans"] = labels.astype('int')
    plt.scatter(df['lon'], df['lat'], c=labels, s=50, alpha=0.5)
    plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
    plt.show()
    return df_origin, centroids
"""
# NOTE(review): the triple-quote above is never closed within this chunk —
# if the full file does not close it, everything below is inside a string
# literal (i.e. effectively commented out). Confirm against the full file.
import pandas as pd
import numpy as np
from kmeans import KMeansClassifier
import matplotlib.pyplot as plt

# Load the dataset (tab-separated file read as a DataFrame) and return it
# as a float matrix.
# NOTE(review): np.float was removed in NumPy 1.24 — this will fail on
# modern NumPy; left unchanged here (documentation-only pass).
def loadDataset(infile):
    df = pd.read_csv(infile, sep='\t', header=0, dtype=str, na_filter=False)
    return np.array(df).astype(np.float)

if __name__=="__main__":
    data_X = loadDataset(r"data/testSet.txt")
    k = 3
    # Fit the project's k-means implementation with k clusters.
    clf = KMeansClassifier(k)
    clf.fit(data_X)
    cents = clf._centroids
    labels = clf._labels
    sse = clf._sse
    # One plotting color per possible cluster (up to 10).
    colors = ['b','g','r','k','c','m','y','#e24fff','#524C90','#845868']
    for i in range(k):
        # Indices of the samples assigned to cluster i.
        index = np.nonzero(labels==i)[0]
        x0 = data_X[index, 0]
        x1 = data_X[index, 1]
        y_i = i
        for j in range(len(x0)):
            # Draw each sample as its cluster id, in the cluster's color.
            plt.text(x0[j], x1[j], str(y_i), color=colors[i], \
                fontdict={'weight': 'bold', 'size': 6})
        # Mark the cluster centroid with a bold 'x'.
        plt.scatter(cents[i,0],cents[i,1],marker='x',color=colors[i],\
            linewidths=7)
from kmeans import KMeansClassifier def load_data(path): df = pd.read_csv(path, sep="\t", header=0, dtype=str, na_filter=False) return np.array(df).astype(np.float) if __name__ == "__main__": project_dir = os.path.dirname(os.path.realpath(__file__)) data = load_data(os.path.join(project_dir, 'data', 'test.txt')) k = 3 classifier = KMeansClassifier(k) classifier.fit(data) centers = classifier._centroids labels = classifier._labels sse = classifier._sse print(labels) print(sse) colors = [ 'b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90', '#845868' ] for i in range(k): index = np.nonzero(labels == i)[0] x = data[index, 0] y = data[index, 1] for j in range(len(x)): plt.text(x[j], y[j],