示例#1
0
minSSE = sys.maxint
minInitialMeans = np.zeros((k, len(points[0])))
minMeans = np.zeros((k, len(points[0])))
minLabels = np.zeros(len(points), dtype=int)



    
# initialize means
numOfExperiments = 5 #number of random initial means experiments
for count in xrange(numOfExperiments):
    if numOfExperiments == 1:
        initialMeans[:] = np.loadtxt(os.path.join(os.path.dirname(__file__), fninitcentroids))
    else:
        initialMeans[:] = Utils.getInitialMeans(points, k)    
        
    print "Initial Means: \n%s" % initialMeans
    print "Initial SSE: %s" % Utils.calcSSE(points, initialMeans)
    
    means = np.zeros((k, len(points[0])))
    means[:] = initialMeans
    finalmeans, labels, SSE = km.run(points, means, numberOfClusters=k, threshold=d, maxiterations=i)
    print "final means: %s" % finalmeans

    if SSE < minSSE:
        print '**New min SSE %s' % SSE
        minSSE = SSE
        minInitialMeans[:] = initialMeans
        print 'Min initial means\n%s' % initialMeans
        minMeans[:] = finalmeans
示例#2
0
import numpy as np
import os
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from util.utilities import Utils

# Evaluate a stored k-means clustering with the silhouette coefficient.
fn = os.path.join(os.path.dirname(__file__), 'data/zdata.txt')
print "opening file %s" % fn
points = np.loadtxt(fn)

#normalize using z-score or sigmoid
# Must match the normalization used when the labels were produced,
# otherwise the silhouette score is computed in a different space.
points = Utils.zscore(points)
#points = Utils.sigmoid(points)
#points = whiten(points)


# Per-point cluster assignments saved by the clustering run.
# NOTE(review): np.loadtxt yields a float array; sklearn appears to accept
# float labels here, but an explicit int conversion would be safer — confirm.
minLabels = np.loadtxt(os.path.join(os.path.dirname(__file__), 'data/tmp/minLabels.txt'))

#print 'sc kmeans2 %s ' % metrics.silhouette_score(points, idx, metric='euclidean')
print 'sc kmeans user %s ' % metrics.silhouette_score(points, minLabels, metric='euclidean')
示例#3
0
    def run(self, data, means, numberOfClusters, threshold, maxiterations):
        """Run Lloyd-style k-means until convergence or maxiterations.

        data -- 2-D array of points, one row per point.
        means -- initial centroids; updated IN PLACE each iteration, so the
            caller's array holds the final centroids on return.
        numberOfClusters -- number of centroids (k).
        threshold -- stop when the summed euclidean movement of all
            centroids between two consecutive iterations drops below this.
        maxiterations -- hard cap on the number of iterations.

        Returns (means, labels, SSE).

        NOTE(review): in the active code path only Utils.calcNewMeans is
        called; `labels` is never assigned (only the commented-out variants
        below write it), so the returned labels stay all zeros — verify
        against Utils.calcNewMeans and the callers that consume labels.
        """
        # initialize means
#        means = np.array(random.sample(data, numberOfClusters))
#        print "Initial Means: \n%s" % means
        
        pointsInClusters = np.zeros(numberOfClusters)
        SSE = 0
        labels = np.zeros(len(data), dtype=int)
        iteration = 0
        
        # Accumulated wall-clock time over all iterations (seconds).
        total = 0
        #plotting
#        self.doPlots(data, labels, means, iteration, title='iteration %i:' % (iteration))
#        import pdb;pdb.set_trace()
        while iteration < maxiterations:
            start = time()
            
            #initialize labels for each iteration
            iteration += 1
            print "Iteration %d" % iteration 
            
            #Distance matrics version
            #Using distance matrix calculations
#            #Calculating the distance to nearest cluster
            # Assignment + update step delegated to Utils.calcNewMeans;
            # presumably returns (new centroids, per-cluster point counts)
            # — confirm against Utils.
            meansNew, pointsInClusters = Utils.calcNewMeans(data, means)
            
            #calculate a new mean for each cluster
#            meansNew, pointsInClusters = self.calculateMeans(data, labels, numberOfClusters)
            

            #find nearest centroid, where line is a data vector
            
            #feature diff feature version!
#            meansNew = np.zeros((numberOfClusters, len(data[0])))
#            pointsInClusters = np.zeros(numberOfClusters)
#            for i in xrange(len(data)):
#                mindist = sys.maxint
#                minCentroid = None
#                point = data[i]  
#                for idx in xrange(numberOfClusters):
#                    d = 0
#                    for i in xrange(len(means[idx])):
#                        c = means[idx]
#                        d += abs(point[i] - c[i])**2
#                    
#                    d = math.sqrt(d)
#                    if(d < mindist or minCentroid == None):
#                        mindist = d
#                        minCentroid = idx
#                labels[i] = minCentroid
#                meansNew[minCentroid] += point
#                pointsInClusters[minCentroid] += 1
#            
#            for i in xrange(len(meansNew)):
#                meansNew[i] = meansNew[i] / float(pointsInClusters[i])
                
                
                #Point diff Point (vectorisation)!
#            meansNew = np.zeros((numberOfClusters, len(data[0])))
#            pointsInClusters = np.zeros(numberOfClusters)
#            for i in xrange(len(data)):
#                mindist = sys.maxint
#                minCentroid = None
#                point = data[i]  
#                for idx in xrange(numberOfClusters):
#                    d = np.sqrt(np.sum((point-means[idx])**2))
#                    if(d < mindist or minCentroid == None):
#                        mindist = d
#                        minCentroid = idx
#                labels[i] = minCentroid
#                meansNew[minCentroid] += point
#                pointsInClusters[minCentroid] += 1
#            
#            for i in xrange(len(meansNew)):
#                meansNew[i] = meansNew[i] / float(pointsInClusters[i])

            #Point diff Array version (vectorisation)
#            Using point and centroids calculations, like used in MR k-means
#            Calculating the distance to nearest cluster and new mean
#            pointsInClusters = np.zeros(numberOfClusters)
#            meansNew = np.zeros((numberOfClusters, len(data[0])))
#            for i in xrange(len(data)):
#                point = data[i]
#                d = np.sqrt(np.sum((point-means)**2,axis=1))
#                minCentroidIdx = d.argmin()
#                labels[i] = minCentroidIdx
#                meansNew[minCentroidIdx] += point
#                pointsInClusters[minCentroidIdx] += 1
#            
#            for i in xrange(len(meansNew)):
#                meansNew[i] = meansNew[i] / float(pointsInClusters[i])
                
                
            #measure calculation time
            end = time()
            print 'time: %f' % (end-start)
            total += (end-start)
            
            #check if the means have changed  
            # Convergence measure: sum of euclidean distances between each
            # old centroid and its updated counterpart.
            meansDiff = 0  
            for i in xrange(numberOfClusters):
                pprint('%s %s' % (i, meansNew[i]))
                meansDiff += distance.euclidean(meansNew[i], means[i])
                
            print 'Means difference: %f' % meansDiff
            
            #calculate the within cluster variation, sum of squared distances between all objects in cluster and its centroid
            SSE = Utils.calcSSE(data, meansNew)            
            print "SSE: %0.3f" % SSE
            
            #plotting
#            self.doPlots(data, labels, means, iteration, title='iteration %i meansdiff: %f' % (iteration, meansDiff))
                
            # In-place copy so the caller's `means` array tracks the update.
            means[:] = meansNew
            if meansDiff < threshold:
                break
            
            
        
        #End of While loop
        #KMeans iterative process ends here
        
        #If mean difference under threshold or there are max iterations
        # NOTE(review): if maxiterations < 0 the loop never runs and the
        # else branch would hit an undefined meansDiff — confirm callers
        # always pass maxiterations >= 0.
        if iteration == maxiterations:
            print "Max iterations reached: %d" % iteration
        else:
            print "Means difference: %f is under threshold %f" % (meansDiff, threshold)
            
        #print "Clusters:"
        for i in xrange(numberOfClusters):
            print "Cluster %d, number of points %d" % (i, pointsInClusters[i])
        
        if self.showsubplots:
            Plot.subplotClusters(data, labels, means, iteration, title='final means')
            pylab.show()
            
        print 'total time: %f' %total
       
            
        return means, labels, SSE