Пример #1
0
class KMeans:
    
    _bingDocsGenerator = None
    _vectorSpace = None
    _vectorsInfo = [{'vector':[], 'class':'texas aggies'}]
    _classList = ['texas aggies', 'texas longhorns',
                 'duke blue devils','dallas cowboys',
                 'dallas mavericks']
    _expectedNumberOfClusters = 6
    _clusters = {'cluster1' : {'vectorIndices':set(), 'center' : [], 
                               'assignedclasses':{'texas aggies': 0}
                            }
                }
    
    def __init__(self, dataCacheFilePath):
        self._bingDocsGenerator = DocumentsGenerator(dataCacheFilePath)
        self._vectorSpace = VectorSpace(self._populateDocuments())
        self._vectorsInfo =  self._vectorSpace.getAllDocumentVectors()
    
    def _populateDocuments(self):
        print 'Getting documents for clustering'
        jsonData = self._bingDocsGenerator.getDocuments(self._classList)
        return jsonData
    
    def _generateRandomPoint(self):
        numberOfPoints = len(self._vectorsInfo)
        randomIndex = randint(0, numberOfPoints - 1)
        randomPoint = self._vectorsInfo[randomIndex]
        return randomPoint
    
    def _getDistance(self, docVector1, docVector2):
        return self._vectorSpace.getEuclidianDistance(docVector1, docVector2)
        #return 1 - self._vectorSpace.getCosineSimilarity(docVector1, docVector2)
        
    def _isContained(self, vectorList, vectorElement):
        for vector in vectorList:
            if self._vectorSpace._areEqual(vector['vector'], vectorElement['vector']):
                return True
        return False
        
    def _initializeClusters(self):
        print 'Initializing clusters'
        randomCenters = []
        while True:
            randomCenter = self._generateRandomPoint()
            
            if not self._isContained(randomCenters, randomCenter):
                randomCenters.append(randomCenter)
            
            if len(randomCenters) == self._expectedNumberOfClusters:
                break
        
        tempClusters = {}
        clusterIndex = 0
        for randomCenter in randomCenters:
            tempCluster = {'center' : randomCenter, 'vectorIndices' : set()}
            tempClusterName = 'cluster' + str(clusterIndex)
            clusterIndex += 1
            tempClusters[tempClusterName] = tempCluster
        self._clusters = tempClusters
        
    def _calculateRSS(self):
        distanceSum = 0
        for clusterName in self._clusters:
            cluster = self._clusters[clusterName]
            clusterCenter = cluster['center']['vector']
            for vectorIndex in cluster['vectorIndices']:
                distance = self._getDistance(clusterCenter, self._vectorsInfo[vectorIndex]['vector'])
                distanceSum += distance
        RSS = distanceSum / len(self._clusters)
        return RSS
    
    def _findClosestCluster(self, docVector):
        distanceList = []
        for clusterName in self._clusters:
            cluster = self._clusters[clusterName]
            clusterCenter = cluster['center']['vector']
            distance = self._getDistance(clusterCenter, docVector)
            distanceList.append((clusterName, distance))
        closestClusterName = min(distanceList, key=itemgetter(1))[0]
        return closestClusterName
     
    def _assignDocVectorToCluster(self, docVectorIndex, clusterName):
        cluster = self._clusters[clusterName]
        cluster['vectorIndices'].add(docVectorIndex)
        docVectorClass = self._vectorsInfo[docVectorIndex]['class']
        if not 'assignedclasses' in cluster:
            cluster['assignedclasses'] = {}
        assignedClasses = cluster['assignedclasses']
        if docVectorClass in assignedClasses:
            assignedClasses[docVectorClass] += 1
        else:
            assignedClasses[docVectorClass] = 1
      
    def _calculateCentroids(self):
        for clusterName in self._clusters:
            cluster = self._clusters[clusterName]
            vectors = []
            for vectorIndex in cluster['vectorIndices']:
                vectors.append(self._vectorsInfo[vectorIndex]['vector'])
            if len(vectors) > 0: 
                centroid = self._vectorSpace.getCentroid(vectors)
                cluster['center'] = {'vector': centroid}

    def _removeEmptyClusters(self):
        emptyClusterNames = []
        for clusterName in self._clusters:
            if len(self._clusters[clusterName]['vectorIndices']) == 0:
                emptyClusterNames.append(clusterName)
        for emptyClusterName in emptyClusterNames:
            self._clusters.pop(emptyClusterName)
            
    def _clearClusterMembers(self):
        for clusterName in self._clusters:
            cluster = self._clusters[clusterName]
            cluster['vectorIndices'].clear()
            cluster['assignedclasses'].clear()
    
    def clusterPoints(self):
        print 'Clustering Points'
        self._initializeClusters()
        iterCount = 0
        while True:
            vectorIndex = 0
            # Assign all vectors to clusters
            for vectorInfo in self._vectorsInfo:
                closestClusterName = self._findClosestCluster(vectorInfo['vector'])
                self._assignDocVectorToCluster(vectorIndex, closestClusterName)
                vectorIndex += 1
            print 'Iteration---'
            for clusterName in self._clusters:
                print clusterName ,'--->' ,self._clusters[clusterName]['assignedclasses']
            
            RI = self.getRandIndex()
            print 'RI : ', RI            
            RSS = self._calculateRSS()
            print 'RSS:', RSS
            purity = self.getPurity()
            print 'Purity is', purity

            if iterCount > 10:
                print 'Restarting !!----------------------------------------------------------------------'
                self._initializeClusters()
                iterCount = 0
                continue
                        
            self._calculateCentroids()
            self._clearClusterMembers()
            iterCount += 1
            

    def _getVectorClassCounts(self):
        vectorCountDict = {}
        for clusterName in self._clusters:
            cluster = self._clusters[clusterName]
            vectorCountDict[clusterName] = {}
            for className in self._classList:
                vectorCountDict[clusterName][className] = 0
                for vectorIndex in cluster['vectorIndices']:
                    if self._vectorsInfo[vectorIndex]['class'] == className:
                        vectorCountDict[clusterName][className] += 1
        return vectorCountDict


    def getMaxCountClass(self, vectorCountDict, clusterName):
            maxCount = 0
            classCountDict = vectorCountDict[clusterName]
            for className in classCountDict:
                classCount = classCountDict[className]
                if classCount > maxCount:
                    maxCount = classCount
            return maxCount

  
    def _belongToSameCluster(self, vectorIndex1, vectorIndex2):
        for clusterName in self._clusters:
            cluster = self._clusters[clusterName]
            vectorIndices = cluster['vectorIndices']
            if  vectorIndex1 in vectorIndices and vectorIndex2 in vectorIndices:
                return True
        return False
        
    def getPurity(self):
        print 'Calculating purity'
        vectorCountDict = self._getVectorClassCounts()
        maxCountSum = 0
        for clusterName in vectorCountDict:
            maxCount = self.getMaxCountClass(vectorCountDict, clusterName)
            maxCountSum += maxCount
        purity = maxCountSum / float(len(self._vectorsInfo))
        return purity
    
    def getRandIndex(self):
        print 'Calculating Rand Index'
        falsePositivesCount = 0
        falseNegativesCount = 0
        truePositivesCount = 0
        trueNegativesCount = 0
        iterCount = 0
        for vectorIndex1 in range(len(self._vectorsInfo)):
            for vectorIndex2 in range(len(self._vectorsInfo)):
                if vectorIndex1 == vectorIndex2:
                    continue
                else:
                    iterCount += 1
                    vectorInfo1 = self._vectorsInfo[vectorIndex1]
                    vectorInfo2 = self._vectorsInfo[vectorIndex2]
                    haveSameClass = vectorInfo1['class'] == vectorInfo2['class']
                    haveSameCluster = self._belongToSameCluster(vectorIndex1, vectorIndex2)

                    if haveSameClass:
                        if haveSameCluster:
                            truePositivesCount += 1
                        else :
                            falseNegativesCount += 1
                    else:
                        if haveSameCluster:
                            falsePositivesCount += 1
                        else :
                            trueNegativesCount += 1
        print 'Number of iterations :', iterCount
        print 'TN:', trueNegativesCount/ 2, ' TP: ', truePositivesCount/2
        print 'FP: ', falsePositivesCount/2, 'FN: ', falseNegativesCount/2
        total = trueNegativesCount + truePositivesCount + falseNegativesCount + falsePositivesCount
        RI = float(truePositivesCount + trueNegativesCount) / total
        return RI