Пример #1
0
def chooseK(data_X,data_Y):
    """Scan k = 1..99 and return the (Rand index, k) pair that scores best.

    For each k the data is clustered with KMeansClassifier and the cluster
    labels are compared against the reference labels data_Y over all pairs
    of samples using the Rand index.
    """
    best_rd = 0
    best_k = 0
    n = len(data_Y)
    for k in range(1, 100):
        clf = KMeansClassifier(k)
        clf.fit(data_X)
        labels = clf._labels
        # Pair-counting terms of the Rand index:
        # agree_both: same true label AND same cluster
        # true_only:  same true label, different cluster
        # clus_only:  different true label, same cluster
        # agree_none: different in both
        agree_both = true_only = clus_only = agree_none = 0
        for i in range(n - 1):
            for j in range(i + 1, n):
                same_truth = data_Y[i] == data_Y[j]
                same_cluster = labels[i] == labels[j]
                if same_truth and same_cluster:
                    agree_both += 1
                elif same_truth:
                    true_only += 1
                elif same_cluster:
                    clus_only += 1
                else:
                    agree_none += 1
        # Rand index = (agreements) / (number of pairs n*(n-1)/2)
        rd = 2 * (agree_both + agree_none) / (n * (n - 1))
        print("rd ", rd, "k ", k)
        if rd > best_rd:
            best_rd = rd
            best_k = k
    return best_rd, best_k
Пример #2
0
def kmeans_classification_builder(centroid_func, x_train, x_test, y_train,
                                  y_test):
    """Fit a 10-cluster KMeansClassifier on digit data and print test accuracy.

    Also saves a tiled preview of the first N training digits (8x8 images
    on a grid of 10x10-pixel cells) to plots/digits.png.
    """
    # Tile the first N training digits onto a square grid.
    N = 25
    grid = int(np.ceil(np.sqrt(N)))
    canvas = np.zeros((10 * grid, 10 * grid))
    for idx in range(N):
        row, col = divmod(idx, grid)
        canvas[10 * row:10 * row + 8,
               10 * col:10 * col + 8] = x_train[idx].reshape([8, 8])
    plt.imsave('plots/digits.png', canvas, cmap='Greys')

    n_cluster = 10
    classifier = KMeansClassifier(n_cluster=n_cluster, max_iter=100, e=1e-6)
    classifier.fit(x_train, y_train, centroid_func)
    y_hat_test = classifier.predict(x_test)

    assert y_hat_test.shape == y_test.shape, \
        'y_hat_test and y_test should have same shape'

    print(
        '[*] Prediction accuracy of K-means classifier with {} cluster is {}'.
        format(n_cluster, np.mean(y_hat_test == y_test)))
Пример #3
0
def getkmeansresult(k, data_X, data_od):
    """Cluster data_X into k groups and validate the resulting clustering.

    Args:
        k: number of clusters.
        data_X: 2-D array whose first two columns are point coordinates.
        data_od: 2-D array of per-point weights (column 0 is summed per cluster).

    Returns:
        (flag, clf): flag is False if any point lies more than 3000 from its
        cluster centroid or any cluster's summed weight exceeds 4000;
        clf is the fitted KMeansClassifier.
    """
    clf = KMeansClassifier(k)
    clf.fit(data_X, data_od)
    cents = clf._centroids
    labels = clf._labels
    flag = True
    for i in range(k):
        index = np.nonzero(labels == i)[0]
        x0 = data_X[index, 0]
        x1 = data_X[index, 1]
        tmpsum = 0
        for j in range(len(x0)):
            # np.math was removed in NumPy 2.0; np.hypot computes the same
            # Euclidean distance sqrt(dx**2 + dy**2).
            dist = np.hypot(cents[i, 0] - x0[j], cents[i, 1] - x1[j])
            tmpsum += data_od[index, 0][j]
            if dist > 3000:
                flag = False
                break
        if tmpsum > 4000:
            flag = False
        if not flag:
            break
    return flag, clf
Пример #4
0
def kmeans_classification():
    """Compare a 30-cluster k-means classifier on the digits data against
    logistic-regression and k-nearest-neighbour baselines, printing the
    accuracy of each, then save the k-means results to an .npz file."""
    x_train, x_test, y_train, y_test = load_digits()

    # Tile the first N training digits (8x8 each, 10x10-px cells) into a
    # preview image and save it.
    N = 25
    grid = int(np.ceil(np.sqrt(N)))
    canvas = np.zeros((10 * grid, 10 * grid))
    for idx in range(N):
        row, col = divmod(idx, grid)
        canvas[10 * row:10 * row + 8,
               10 * col:10 * col + 8] = x_train[idx].reshape([8, 8])
    plt.imsave('plots/digits.png', canvas, cmap='Greys')

    n_cluster = 30
    classifier = KMeansClassifier(n_cluster=n_cluster, max_iter=100, e=1e-6)
    classifier.fit(x_train, y_train)
    y_hat_test = classifier.predict(x_test)

    assert y_hat_test.shape == y_test.shape, \
        'y_hat_test and y_test should have same shape'

    print('Prediction accuracy of K-means classifier with {} cluster is {}'.
          format(n_cluster, np.mean(y_hat_test == y_test)))

    # Baseline 1: logistic regression.
    linear_classifier = LogisticRegression()
    linear_classifier.fit(x_train, y_train)
    y_hat_test = linear_classifier.predict(x_test)
    print('Accuracy of logistic regression classifier is {}'.format(
        np.mean(y_hat_test == y_test)))

    # Baseline 2: k-nearest neighbours.
    KNNClassifier = KNeighborsClassifier()
    KNNClassifier.fit(x_train, y_train)
    y_hat_test = KNNClassifier.predict(x_test)
    print('Accuracy of Nearest Neighbour classifier is {}'.format(
        np.mean(y_hat_test == y_test)))

    # Note: y_hat_test holds the KNN predictions at this point (last assigned).
    np.savez('results/k_means_classification.npz',
             y_hat_test=y_hat_test,
             y_test=y_test,
             centroids=classifier.centroids,
             centroid_labels=classifier.centroid_labels)
Пример #5
0
    def fit(self, X):
        """Bisecting k-means: repeatedly 2-way split the cluster whose split
        yields the lowest total SSE until self._k clusters exist.

        Args:
            X: (m, d) array of samples.

        Sets self._labels (cluster index per sample), self._sse (total
        squared error) and self._centroids ((k, d) array).
        """
        m = X.shape[0]
        # Column 0: assigned cluster index; column 1: squared distance to
        # that cluster's centroid.
        self._clusterAssment = np.zeros((m, 2))
        centroid0 = np.mean(X, axis=0).tolist()
        cenList = [centroid0]
        # Initial squared error of every sample to the single global centroid.
        for j in range(m):
            self._clusterAssment[j,
                                 1] = self._calEDist(np.asarray(centroid0),
                                                     X[j, :])**2

        while (len(cenList) < self._k):
            lowestSSE = np.inf
            for i in range(len(cenList)):
                index_all = self._clusterAssment[:, 0]  # cluster index of each sample
                value = np.nonzero(index_all == i)  # samples belonging to cluster i
                pstInCurrCluster = X[value[0], :]
                # Trial 2-way split of cluster i.
                clf = KMeansClassifier(k=2)
                clf.fit(pstInCurrCluster)
                centroidMat, splitClustAss = clf._centroids, clf._clusterAssment
                sseSplit = sum(splitClustAss[:, 1])
                index_all = self._clusterAssment[:, 0]
                value = np.nonzero(index_all == i)
                sseNotSplit = sum(self._clusterAssment[value[0], 1])
                # Keep the split that minimizes total SSE across all clusters.
                if (sseSplit + sseNotSplit) < lowestSSE:
                    bestCentToSplit = i
                    bestNewCents = centroidMat
                    bestClustAss = splitClustAss.copy()
                    lowestSSE = sseSplit + sseNotSplit

            # After splitting, one sub-cluster keeps the parent's index and
            # the other becomes cluster len(cenList), appended to cenList.
            bestClustAss[np.nonzero(bestClustAss[:, 0] == 1)[0],
                         0] = len(cenList)
            bestClustAss[np.nonzero(bestClustAss[:, 0] == 0)[0],
                         0] = bestCentToSplit
            cenList[bestCentToSplit] = bestNewCents[0, :].tolist()
            # BUG FIX: the original called .tolist() on append()'s None
            # return value (AttributeError); .tolist() belongs on the row.
            cenList.append(bestNewCents[1, :].tolist())
            self._clusterAssment[np.nonzero(
                self._clusterAssment[:, 0] ==
                bestCentToSplit)[0], :] = bestClustAss

        self._labels = self._clusterAssment[:, 0]
        self._sse = sum(self._clusterAssment[:, 1])
        self._centroids = np.asarray(cenList)
Пример #6
0
def get_cluster(starttime, stoptime, k):
    """Run k-means over the station data for the period [starttime, stoptime].

    Args:
        starttime, stoptime: period bounds passed through to get_pandas_df.
        k: number of clusters.

    Returns:
        (df_origin, centroids): the original station dataframe with a new
        'label_kmeans' integer column, and the (k, 2) centroid array.
        Also displays a scatter plot of stations coloured by cluster.
    """
    df = get_pandas_df(starttime, stoptime)
    df_origin = df[1]
    df = df[0]

    # np.float was removed in NumPy 1.24; the builtin float is equivalent.
    data_X = np.array(df).astype(float)
    clf = KMeansClassifier(k)
    clf.fit(data_X)
    centroids = clf._centroids
    labels = clf._labels
    df_origin["label_kmeans"] = labels.astype('int')

    plt.scatter(df['lon'], df['lat'], c=labels, s=50, alpha=0.5)
    plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
    plt.show()

    return df_origin, centroids
Пример #7
0
@author: liudiwei
"""
import pandas as pd
import numpy as np
from kmeans import KMeansClassifier
import matplotlib.pyplot as plt

#加载数据集,DataFrame格式,最后将返回为一个matrix格式
def loadDataset(infile):
    """Read a tab-separated data file into a 2-D float numpy array.

    Args:
        infile: path to a TSV file whose first row is a header.

    Returns:
        (rows, cols) numpy array of float64 values.
    """
    df = pd.read_csv(infile, sep='\t', header=0, dtype=str, na_filter=False)
    # np.float was removed in NumPy 1.24; the builtin float is equivalent.
    return np.array(df).astype(float)

if __name__=="__main__":
    # Cluster the demo data set with k-means, then plot every sample as a
    # text label (its cluster id) and each centroid as an 'x' marker.
    samples = loadDataset(r"data/testSet.txt")
    num_clusters = 3
    model = KMeansClassifier(num_clusters)
    model.fit(samples)
    centers = model._centroids
    assignments = model._labels
    total_sse = model._sse
    palette = ['b','g','r','k','c','m','y','#e24fff','#524C90','#845868']
    for cluster_id in range(num_clusters):
        members = np.nonzero(assignments == cluster_id)[0]
        for px, py in zip(samples[members, 0], samples[members, 1]):
            plt.text(px, py, str(cluster_id), color=palette[cluster_id],
                     fontdict={'weight': 'bold', 'size': 6})
        plt.scatter(centers[cluster_id, 0], centers[cluster_id, 1],
                    marker='x', color=palette[cluster_id], linewidths=7)
Пример #8
0
import os

from kmeans import KMeansClassifier


def load_data(path):
    """Read a tab-separated data file into a 2-D float numpy array.

    Args:
        path: path to a TSV file whose first row is a header.

    Returns:
        (rows, cols) numpy array of float64 values.
    """
    df = pd.read_csv(path, sep="\t", header=0, dtype=str, na_filter=False)
    # np.float was removed in NumPy 1.24; the builtin float is equivalent.
    return np.array(df).astype(float)


if __name__ == "__main__":

    project_dir = os.path.dirname(os.path.realpath(__file__))
    data = load_data(os.path.join(project_dir, 'data', 'test.txt'))
    k = 3
    classifier = KMeansClassifier(k)
    classifier.fit(data)
    centers = classifier._centroids
    labels = classifier._labels
    sse = classifier._sse
    print(labels)
    print(sse)
    colors = [
        'b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90', '#845868'
    ]
    for i in range(k):
        index = np.nonzero(labels == i)[0]
        x = data[index, 0]
        y = data[index, 1]
        for j in range(len(x)):
            plt.text(x[j],
Пример #9
0

if __name__=="__main__":
    # Cluster the Iris training features with k-means and print the
    # predicted cluster labels alongside the true class labels.
    #data_X = loadDataset(r"data/testSet.txt")
    #data_X,label_X = readUCIdata()
    #data_X,labelreallist,labellist,drawcolorlist,datalistx,datalisty = readUCIdata1('perfume_data.xlsx')
    #print(data_X[0][1])
    trainingdata,testdata = readUCIIris()
    trainingfeature,traininglabel = splitlabanddata(trainingdata)
    
    data_X = np.array(trainingfeature)
    #k = 4
    # Number of clusters.
    k=3
    print(data_X)
    #print(data_X[0])
    clf = KMeansClassifier(k)
    clf.fit(data_X)
    cents = clf._centroids
    labels = clf._labels
    sse = clf._sse
    # One colour name per possible cluster (more than needed for k=3).
    colors = ['red','purple','darkgreen','darkgray','darksalmon','darkred','olive','yellow','yellowgreen',
'silver','cyan','pink','orangered','orange','navy','magenta','lightgoldenrodyellow',
'lavenderblush','honeydew','mediumseagreen']  

    
    print(cents)
    pred =  clf.predict(data_X)
    print(pred)
    print("The labels is:",labels)
    print(traininglabel)
    colorlist = []
Пример #10
0
class TSP(object):
    """Greedy nearest-neighbour TSP heuristic that uses k-means clustering
    to restrict each nearest-city search to the closest clusters.

    NOTE(review): the original was Python 2 code (print statements, csv over
    binary files); this version is Python 3 compatible with identical logic.
    """

    def __init__(self):
        self.num_cities = None
        self.cities = None            # (n, 3) array of [id, x, y] rows
        self.kmeans = KMeansClassifier(k=30)
        self.cid_to_cities = dict()   # cluster id -> list of city indices
        self.visited_col = None       # (n, 1) per-city visited flags

    def getdist(self, c1, c2):
        """Return the Euclidean distance between two [id, x, y] city rows."""
        x1, y1 = c1[1], c1[2]
        x2, y2 = c2[1], c2[2]
        return math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)

    def _readdata(self, fname):
        """Load space-separated 'id x y' rows from fname into self.cities.

        City ids are normalized to start at 0 whether the input file is
        0- or 1-indexed (decided by peeking at the first row).
        """
        citydata = []
        idx_offset = 0
        # Python 3: csv.reader needs text mode; the original's 'rb' only
        # worked on Python 2.
        with open(fname, 'r', newline='') as f1:
            reader = csv.reader(f1, delimiter=' ')
            for line in reader:
                if int(line[0]) == 0:
                    idx_offset = 0
                else:
                    idx_offset = -1
                break

        with open(fname, 'r', newline='') as f:
            reader = csv.reader(f, delimiter=' ')
            for line in reader:
                city_id = int(line[0]) + idx_offset  # renamed: 'id' shadows the builtin
                x = float(line[1])
                y = float(line[2])
                citydata.append([city_id, x, y])
        self.cities = np.array([[v for v in city] for city in citydata])
        self.cities[:, 0] = self.cities[:, 0].astype(int)

    def _build_cid_to_cities(self, cid):
        """Invert the per-city cluster assignment into cluster -> city list."""
        for (i, cl) in enumerate(cid):
            c = int(cl[0])
            if c in self.cid_to_cities:
                self.cid_to_cities[c].append(i)
            else:
                self.cid_to_cities[c] = [i]

    def prepare_data(self, fname):
        """Read the city file, cluster coordinates, and init visited flags."""
        self._readdata(fname)
        print('    Clustering.')
        cids, _ = self.kmeans.train(self.cities[:, 1:], epochs=5)
        self.visited_col = np.zeros((len(self.cities), 1))
        self._build_cid_to_cities(cids)
        print('    Clustering DONE')

    def closest_city(self, c):
        """Return the index of the nearest unvisited city to city c.

        Only cities in the 50 closest clusters are considered; distance
        ties are broken by the smaller city index. Returns c itself if no
        candidate is found.
        """
        inputcity = self.cities[c]
        cids = self.kmeans.get_closest_clusters(inputcity[1:3], num_clusters=50)
        clustercities = [self.cid_to_cities[i[0]] for i in cids]
        cdist = float('inf')  # was a 99999999 magic sentinel
        outputcity = c
        for city in sum(clustercities, []):
            dist = self.getdist(self.cities[city], inputcity)
            if dist == 0:
                continue
            if self.visited_col[city]:
                continue
            if dist < cdist:
                cdist = dist
                outputcity = city
            elif dist == cdist:
                # Same distance: take the minimum index as outputcity.
                if city < outputcity:
                    outputcity = city
        return outputcity

    def traverse(self, c):
        """Recursively hop to the nearest unvisited city, accumulating the
        travelled distance; records the final city in global lastcity."""
        global lastcity
        if c is None:
            return
        self.visited_col[c] = True
        cc = self.closest_city(c)
        lastcity = cc
        dist = self.getdist(self.cities[c], self.cities[cc])
        if not self.visited_col[cc]:
            self.visited_col[cc] = True
            dist = dist + self.traverse(cc)
        return dist

    def tsp(self):
        """Run the greedy tour from city 0 and close the loop back to 0."""
        dist = self.traverse(0)
        dist = dist + self.getdist(self.cities[lastcity], self.cities[0])
        return dist
Пример #11
0
 def __init__(self):
     """Initialize empty TSP solver state with a 30-cluster k-means model."""
     self.num_cities=None
     # Expected to hold [id, x, y] city rows once data is loaded.
     self.cities=None
     # k-means model used to narrow nearest-city searches to nearby clusters.
     self.kmeans=KMeansClassifier(k=30)
     # Maps cluster id -> list of city indices in that cluster.
     self.cid_to_cities=dict()
     # Per-city visited flags, allocated after the data is read.
     self.visited_col=None