Пример #1
0
def kmeans_classification_builder(centroid_func, x_train, x_test, y_train,
                                  y_test):
    """Train a k-means digit classifier and report its test accuracy.

    Saves a preview grid of training digits to 'plots/digits.png', fits a
    10-cluster KMeansClassifier using ``centroid_func`` for centroid
    initialisation, and prints the prediction accuracy on the test set.

    Parameters
    ----------
    centroid_func : callable
        Centroid-initialisation routine forwarded to ``classifier.fit``.
    x_train, x_test : array-like
        Flattened 8x8 digit images (64 features per row).
    y_train, y_test : array-like
        Digit labels aligned with the corresponding feature arrays.
    """
    # Plot the first N training digits in a roughly square grid.
    N = 25
    grid = int(np.ceil(np.sqrt(N)))  # renamed from `l` (ambiguous, E741)

    # Each digit fills the top-left 8x8 corner of a 10x10 cell, leaving
    # a 2-pixel margin between neighbouring digits.
    im = np.zeros((10 * grid, 10 * grid))
    for m in range(grid):
        for n in range(grid):
            if m * grid + n < N:
                im[10 * m:10 * m + 8,
                   10 * n:10 * n + 8] = x_train[m * grid + n].reshape([8, 8])
    plt.imsave('plots/digits.png', im, cmap='Greys')

    n_cluster = 10
    classifier = KMeansClassifier(n_cluster=n_cluster, max_iter=100, e=1e-6)

    classifier.fit(x_train, y_train, centroid_func)
    y_hat_test = classifier.predict(x_test)

    assert y_hat_test.shape == y_test.shape, \
        'y_hat_test and y_test should have same shape'

    print(
        '[*] Prediction accuracy of K-means classifier with {} cluster is {}'.
        format(n_cluster, np.mean(y_hat_test == y_test)))
Пример #2
0
def chooseK(data_X, data_Y):
    """Scan k = 1..99 and return the best Rand index with its k.

    For each k, fits a KMeansClassifier on ``data_X`` and scores the
    labelling against the ground truth ``data_Y`` with the Rand index
    rd = 2 * (a + d) / (n * (n - 1)), where `a` counts pairs grouped
    together in both partitions and `d` counts pairs separated in both.
    """
    best_rd = 0
    best_k = 0
    n = len(data_Y)
    for k in range(1, 100):
        clf = KMeansClassifier(k)
        clf.fit(data_X)
        labels = clf._labels
        # Pair-counting over every unordered pair (i, j) with i < j.
        a = b = c = d = 0
        for i in range(n):
            for j in range(i + 1, n):
                same_truth = data_Y[i] == data_Y[j]
                same_label = labels[i] == labels[j]
                if same_truth and same_label:
                    a += 1
                elif same_truth:
                    b += 1
                elif same_label:
                    c += 1
                else:
                    d += 1
        rd = 2 * (a + d) / (n * (n - 1))
        print("rd ", rd, "k ", k)
        if rd > best_rd:
            best_rd = rd
            best_k = k
    return best_rd, best_k
Пример #3
0
def getkmeansresult(k, data_X, data_od):
    """Fit a k-means model and validate the resulting clusters.

    A clustering is accepted (flag=True) only if every point lies within
    3000 units of its centroid and each cluster's sum over the first
    ``data_od`` column stays at or below 4000.

    Parameters
    ----------
    k : int
        Number of clusters.
    data_X : ndarray
        Sample coordinates; columns 0 and 1 are used as (x, y).
    data_od : ndarray
        Per-sample values; column 0 is summed per cluster.

    Returns
    -------
    (bool, KMeansClassifier)
        Whether all clusters satisfy both constraints, and the fitted model.
    """
    clf = KMeansClassifier(k)
    clf.fit(data_X, data_od)
    cents = clf._centroids
    labels = clf._labels
    flag = True
    for i in range(k):
        index = np.nonzero(labels == i)[0]
        x0 = data_X[index, 0]
        x1 = data_X[index, 1]
        weights = data_od[index, 0]
        tmpsum = 0
        for j in range(len(x0)):
            # Euclidean distance from point j to centroid i.
            # np.sqrt replaces np.math.sqrt: the accidental np.math alias
            # was removed in NumPy 2.0.
            tmpl = np.sqrt(
                np.power(cents[i, 0] - x0[j], 2) +
                np.power(cents[i, 1] - x1[j], 2))
            tmpsum += weights[j]
            if tmpl > 3000:
                flag = False
                break
        if tmpsum > 4000:
            flag = False
        if not flag:
            break
    return flag, clf
Пример #4
0
def kmeans_classification():
    """Compare a k-means classifier against logistic regression and k-NN
    on the digits dataset, then save the k-means results to disk.
    """
    x_train, x_test, y_train, y_test = load_digits()

    # Plot the first N training digits in a roughly square grid; each
    # digit fills the top-left 8x8 corner of a 10x10 cell.
    N = 25
    grid = int(np.ceil(np.sqrt(N)))  # renamed from `l` (ambiguous, E741)

    im = np.zeros((10 * grid, 10 * grid))
    for m in range(grid):
        for n in range(grid):
            if m * grid + n < N:
                im[10 * m:10 * m + 8,
                   10 * n:10 * n + 8] = x_train[m * grid + n].reshape([8, 8])
    plt.imsave('plots/digits.png', im, cmap='Greys')

    n_cluster = 30
    classifier = KMeansClassifier(n_cluster=n_cluster, max_iter=100, e=1e-6)

    classifier.fit(x_train, y_train)
    y_hat_test = classifier.predict(x_test)

    assert y_hat_test.shape == y_test.shape, \
        'y_hat_test and y_test should have same shape'

    print('Prediction accuracy of K-means classifier with {} cluster is {}'.
          format(n_cluster, np.mean(y_hat_test == y_test)))

    linear_classifier = LogisticRegression()
    linear_classifier.fit(x_train, y_train)
    y_hat_linear = linear_classifier.predict(x_test)
    print('Accuracy of logistic regression classifier is {}'.format(
        np.mean(y_hat_linear == y_test)))

    knn_classifier = KNeighborsClassifier()
    knn_classifier.fit(x_train, y_train)
    y_hat_knn = knn_classifier.predict(x_test)
    print('Accuracy of Nearest Neighbour classifier is {}'.format(
        np.mean(y_hat_knn == y_test)))

    # BUG FIX: y_hat_test was previously overwritten by the logistic
    # regression and k-NN predictions, so the saved .npz contained the
    # k-NN results instead of the k-means results it is named after.
    np.savez('results/k_means_classification.npz',
             y_hat_test=y_hat_test,
             y_test=y_test,
             centroids=classifier.centroids,
             centroid_labels=classifier.centroid_labels)
Пример #5
0
    def fit(self, X):
        """Bisecting k-means: repeatedly 2-way split the cluster whose
        split yields the lowest total SSE until ``self._k`` clusters exist.

        Sets ``self._labels``, ``self._sse`` and ``self._centroids``.

        Parameters
        ----------
        X : ndarray of shape (m, n_features)
            Training samples.
        """
        m = X.shape[0]
        # Column 0: assigned cluster index; column 1: squared error.
        self._clusterAssment = np.zeros((m, 2))
        centroid0 = np.mean(X, axis=0).tolist()
        cenList = [centroid0]
        # Initial squared error of every sample to the global centroid.
        for j in range(m):
            self._clusterAssment[j, 1] = self._calEDist(
                np.asarray(centroid0), X[j, :]) ** 2

        while len(cenList) < self._k:
            lowestSSE = np.inf
            for i in range(len(cenList)):
                # Samples currently assigned to cluster i.
                index_all = self._clusterAssment[:, 0]
                value = np.nonzero(index_all == i)
                ptsInCurrCluster = X[value[0], :]
                # Trial 2-way split of cluster i.
                clf = KMeansClassifier(k=2)
                clf.fit(ptsInCurrCluster)
                centroidMat, splitClustAss = clf._centroids, clf._clusterAssment
                sseSplit = sum(splitClustAss[:, 1])
                # SSE of every sample outside cluster i (unchanged by split);
                # reuses `value` instead of recomputing the same mask.
                sseNotSplit = sum(self._clusterAssment[value[0], 1])
                if (sseSplit + sseNotSplit) < lowestSSE:
                    bestCentToSplit = i
                    bestNewCents = centroidMat
                    bestClustAss = splitClustAss.copy()
                    lowestSSE = sseSplit + sseNotSplit

            # Re-label the two halves: one keeps the parent's index, the
            # other receives the next free index len(cenList).
            bestClustAss[np.nonzero(bestClustAss[:, 0] == 1)[0],
                         0] = len(cenList)
            bestClustAss[np.nonzero(bestClustAss[:, 0] == 0)[0],
                         0] = bestCentToSplit
            cenList[bestCentToSplit] = bestNewCents[0, :].tolist()
            # BUG FIX: list.append returns None, so the original
            # ``cenList.append(bestNewCents[1, :]).tolist()`` raised
            # AttributeError on every split.
            cenList.append(bestNewCents[1, :].tolist())
            self._clusterAssment[np.nonzero(
                self._clusterAssment[:, 0] ==
                bestCentToSplit)[0], :] = bestClustAss

        self._labels = self._clusterAssment[:, 0]
        self._sse = sum(self._clusterAssment[:, 1])
        self._centroids = np.asarray(cenList)
Пример #6
0
def get_cluster(starttime, stoptime, k):
    """Run k-means over station data for the given period.

    Fetches station data for [starttime, stoptime), clusters the numeric
    features into ``k`` groups, plots the stations coloured by cluster
    with centroids in red, and returns the original dataframe annotated
    with a ``label_kmeans`` column plus the centroid array.
    """
    df = get_pandas_df(starttime, stoptime)
    df_origin = df[1]
    df = df[0]

    # np.float was removed in NumPy 1.24; the builtin float is equivalent.
    data_X = np.array(df).astype(float)
    clf = KMeansClassifier(k)
    clf.fit(data_X)
    centroids = clf._centroids
    labels = clf._labels
    df_origin["label_kmeans"] = labels.astype('int')

    plt.scatter(df['lon'], df['lat'], c=labels, s=50, alpha=0.5)
    plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
    plt.show()

    return df_origin, centroids
Пример #7
0
"""
import pandas as pd
import numpy as np
from kmeans import KMeansClassifier
import matplotlib.pyplot as plt

#加载数据集,DataFrame格式,最后将返回为一个matrix格式
def loadDataset(infile):
    df = pd.read_csv(infile, sep='\t', header=0, dtype=str, na_filter=False)
    return np.array(df).astype(np.float)

if __name__=="__main__":
    data_X = loadDataset(r"data/testSet.txt")
    k = 3
    clf = KMeansClassifier(k)
    clf.fit(data_X)
    cents = clf._centroids
    labels = clf._labels
    sse = clf._sse
    colors = ['b','g','r','k','c','m','y','#e24fff','#524C90','#845868']
    for i in range(k):
        index = np.nonzero(labels==i)[0]
        x0 = data_X[index, 0]
        x1 = data_X[index, 1]
        y_i = i
        for j in range(len(x0)):
            plt.text(x0[j], x1[j], str(y_i), color=colors[i], \
                        fontdict={'weight': 'bold', 'size': 6})
        plt.scatter(cents[i,0],cents[i,1],marker='x',color=colors[i],\
                    linewidths=7)
    
Пример #8
0
from kmeans import KMeansClassifier


def load_data(path):
    """Load a tab-separated numeric data file into a float ndarray.

    The first row is treated as a header; every remaining cell must be
    parseable as a float.
    """
    df = pd.read_csv(path, sep="\t", header=0, dtype=str, na_filter=False)
    # np.float was removed in NumPy 1.24; the builtin float is equivalent.
    return np.array(df).astype(float)


if __name__ == "__main__":

    project_dir = os.path.dirname(os.path.realpath(__file__))
    data = load_data(os.path.join(project_dir, 'data', 'test.txt'))
    k = 3
    classifier = KMeansClassifier(k)
    classifier.fit(data)
    centers = classifier._centroids
    labels = classifier._labels
    sse = classifier._sse
    print(labels)
    print(sse)
    colors = [
        'b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90', '#845868'
    ]
    for i in range(k):
        index = np.nonzero(labels == i)[0]
        x = data[index, 0]
        y = data[index, 1]
        for j in range(len(x)):
            plt.text(x[j],
                     y[j],