Example No. 1
 def test_addDocument(self):
     message1 = Message(3, 'sdf',
                        'A project to cluster high-dimensional streams.',
                        test_time)
     message1.vector = Vector({3: 4})
     stream1 = Stream(3, message1)
     message2 = Message(4, 'sdf',
                        'A project to cluster high-dimensional streams.',
                        test_time)
     message2.vector = Vector({2: 4})
     stream2 = Stream(4, message2)
     self.assertNotEqual(test_time, self.cluster1.lastStreamAddedTime)
     self.cluster1.addDocument(stream1)
     self.assertEqual(test_time, self.cluster1.lastStreamAddedTime)
     # Test if cluster id is set.
     self.assertEqual(self.cluster1.clusterId, stream1.clusterId)
     # Test that cluster mean is updated.
     self.assertEqual({1: 2 / 2., 2: 2., 3: 2.}, self.cluster1)
     # Test that cluster aggregate is updated.
     self.assertEqual({1: 2, 2: 4, 3: 4}, self.cluster1.aggregateVector)
     # Test that document is added to cluster documents.
     self.assertEqual(stream1,
                      self.cluster1.documentsInCluster[stream1.docId])
     self.cluster1.addDocument(stream2)
     self.assertEqual(3, self.cluster1.vectorWeights)
     self.assertEqual({1: 2 / 3., 2: 8 / 3., 3: 4 / 3.}, self.cluster1)
     self.assertEqual({1: 2, 2: 8, 3: 4}, self.cluster1.aggregateVector)
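The last three assertions encode the running-mean invariant: the cluster vector always equals aggregateVector divided by vectorWeights. A plain-dict re-derivation of the expected values, assuming vectorWeights simply counts documents (a sketch, not the library's Cluster implementation):

    # cluster1 starts from Document(1, {1: 2, 2: 4}); see Example No. 4.
    aggregate, weights = {1: 2, 2: 4}, 1.0
    for update in ({3: 4}, {2: 4}):  # stream1's vector, then stream2's
        for dim, value in update.items():
            aggregate[dim] = aggregate.get(dim, 0) + value
        weights += 1
    mean = dict((dim, value / weights) for dim, value in aggregate.items())
    assert aggregate == {1: 2, 2: 8, 3: 4} and weights == 3
    assert mean == {1: 2 / 3., 2: 8 / 3., 3: 4 / 3.}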
Example No. 2
 def setUp(self):
     self.message = Message(
         1, 'sdf', 'A project to cluster high-dimensional streams.',
         datetime.now())
     self.message.vector = Vector({1: 2., 2: 3.})
     self.s1 = Stream(1, self.message)
     self.v1 = Vector({1: 2., 3: 3.})
Example No. 3
 def mapper(self, _, value):
     if False: yield  # I'm a generator!
     [(id0, vec0), (id1, vec1)] = value
     vec0, vec1 = Vector(vec0), Vector(vec1)
     if vec0.cosineSimilarity(vec1) >= self.ssa_threshold:
         # Record the similar pair under the smaller stream id.
         if id0 < id1:
             self.streamIdToSimilarStreamsMap[id0].add(id1)
         else:
             self.streamIdToSimilarStreamsMap[id1].add(id0)
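The mapper's pair test reduces to a cosine similarity threshold. A self-contained stand-in for Vector.cosineSimilarity over sparse dicts (a sketch; the real method lives in library.vector):

    import math

    def cosine(u, v):
        # Dot product over shared keys, normalized by both L2 norms.
        dot = sum(u[k] * v[k] for k in u if k in v)
        norms = (math.sqrt(sum(x * x for x in u.values()))
                 * math.sqrt(sum(x * x for x in v.values())))
        return dot / norms if norms else 0.0

    assert abs(cosine({'1': 4, '2': 8}, {'1': 2, '2': 4}) - 1.0) < 1e-9
    assert cosine({'1': 4, '2': 8}, {'3': 4, '4': 8}) == 0.0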
Example No. 4
 def setUp(self):
     Cluster.clusterIdCounter = 0
     self.docx = Document(1, {1: 2, 2: 4})
     self.docy = Document(2, {2: 4})
     self.cluster1 = Cluster(self.docx)
     self.cluster2 = Cluster(self.docy)
     self.doc1 = Document(3, Vector({3: 4}))
     self.doc2 = Document(4, Vector({2: 4}))
Example No. 5
 def setUp(self):
     self.m1 = Message(1, 'sdf',
                       'A project to cluster high-dimensional streams.',
                       test_time - timedelta(seconds=60))
     self.m1.vector = Vector({1: 1., 2: 3.})
     self.stream = Stream(1, self.m1)
     self.m2 = Message(1, 'sdf',
                       'A project to cluster high-dimensional streams.',
                       test_time)
     self.m2.vector = Vector({2: 3.})
Example No. 6
    def cluster(self, dataIterator):
        i = 1
        for data in dataIterator:
            message = self.convertDataToMessageMethod(data,
                                                      **self.stream_settings)
            #            message = data
            if DataStreamMethods.messageInOrder(message.timeStamp):
                UtilityMethods.updatePhraseTextToPhraseObject(
                    message.vector, message.timeStamp,
                    self.phraseTextToPhraseObjectMap, **self.stream_settings)
                if message.streamId not in self.streamIdToStreamObjectMap:
                    self.streamIdToStreamObjectMap[message.streamId] = Stream(
                        message.streamId, message)
                    self.getClusterAndUpdateExistingClusters(
                        self.streamIdToStreamObjectMap[message.streamId])
                else:
                    previousStreamObject = Vector(
                        vectorInitialValues=self.streamIdToStreamObjectMap[
                            message.streamId])
                    self.streamIdToStreamObjectMap[
                        message.streamId].updateForMessage(
                            message, VectorUpdateMethods.exponentialDecay,
                            **self.stream_settings)
                    streamObject = self.streamIdToStreamObjectMap[
                        message.streamId]
                    distance = Vector.euclideanDistance(
                        streamObject, previousStreamObject)
                    if distance > 10:
                        #                        print i, len(self.clusters), distance
                        self.getClusterAndUpdateExistingClusters(
                            self.streamIdToStreamObjectMap[message.streamId])

                        self.updateDimensionsMethod.call(
                            message.timeStamp,
                            hdStreamClusteringObject=self,
                            currentMessageTime=message.timeStamp)
                        self.clusterFilteringMethod.call(
                            message.timeStamp,
                            hdStreamClusteringObject=self,
                            currentMessageTime=message.timeStamp)

        #                self.clusterAnalysisMethod.call(message.timeStamp, hdStreamClusteringObject=self, currentMessageTime=message.timeStamp)

                    self.clusterAnalysisMethod.call(
                        time.time(),
                        hdStreamClusteringObject=self,
                        currentMessageTime=message.timeStamp,
                        numberOfMessages=i)

#                print i, len(self.clusters)
                i += 1
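In outline, cluster() keeps one Stream vector per streamId, decays it as each message arrives, and re-clusters a stream only once its vector drifts more than a fixed Euclidean distance from its previous state. A schematic sketch of that trigger with scalar "vectors" (hypothetical names; the real code uses Vector.euclideanDistance and VectorUpdateMethods.exponentialDecay):

    DRIFT_THRESHOLD = 10  # mirrors the hard-coded `distance > 10` above

    streams = {}  # streamId -> current scalar state

    def should_recluster(stream_id, value, decay=0.5):
        if stream_id not in streams:
            streams[stream_id] = value
            return True  # first message: always assign a cluster
        previous = streams[stream_id]
        streams[stream_id] = previous * decay + value  # exponential decay update
        return abs(streams[stream_id] - previous) > DRIFT_THRESHOLD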
Example No. 8
 def setUp(self):
     self.phraseVector = {
         'project': 1,
         'cluster': 1,
         'highdimensional': 1,
         'streams': 1
     }
     self.phraseTextAndDimensionMap = TwoWayMap()
     self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'project', 0)
     self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'cluster', 1)
     self.phraseTextToPhraseObjectMap = {
         'project':
         Phrase('project', test_time, score=8),
         'cluster':
         Phrase('cluster', test_time, score=8),
         'abcd':
         Phrase(
             'abcd',
             test_time -
             3 * stream_settings['max_phrase_inactivity_time_in_seconds'],
             score=8)
     }
     self.vector = Vector({0: 1, 1: 1, 2: 1, 3: 1})
     self.initial_max_dimensions = stream_settings['dimensions']
     stream_settings['dimensions'] = 2
Example No. 9
 def _getVectorMappedToDimension(self, vector, phraseTextAndDimensionMap):
     mappedVector = Vector()
     phraseToDimensionMap = phraseTextAndDimensionMap.getMap(
         TwoWayMap.MAP_FORWARD)
     for phrase in self:
         if phrase in phraseToDimensionMap:
             mappedVector[phraseToDimensionMap[phrase]] = self[phrase]
     return mappedVector
Example No. 10
 def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(self):
     mergedCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
     mergedCluster.mergeCluster(self.cluster2)
     self.assertEqual([self.stream1, self.stream2], list(mergedCluster.iterateDocumentsInCluster()))
     meanVectorForAllDocuments = Vector.getMeanVector([self.stream1, self.stream2])
     self.assertEqual(meanVectorForAllDocuments, mergedCluster)
     self.assertEqual([mergedCluster.docId, mergedCluster.docId], list(doc.clusterId for doc in mergedCluster.iterateDocumentsInCluster()))
     self.assertEqual(self.cluster2.lastStreamAddedTime, mergedCluster.lastStreamAddedTime)
Example No. 11
 def getClusterObjectToMergeFrom(streamCluster):
     streamCluster.lastMessageTime = streamCluster.lastStreamAddedTime
     mergedCluster = StreamCluster(streamCluster,
                                   shouldUpdateDocumentId=False)
     mergedCluster.aggregateVector, mergedCluster.vectorWeights = Vector(
         {}), 0.0
     StreamCluster.updateClusterAttributes(mergedCluster, streamCluster)
     return mergedCluster
Example No. 12
 def test_maxClusterSize(self):
     self.assertEqual(1, self.crowd.maxClusterSize)
     message2 = Message(4, 'sdf',
                        'A project to cluster high-dimensional streams.',
                        test_time)
     message2.vector = Vector({2: 4})
     stream2 = Stream(4, message2)
     self.cluster.addDocument(stream2)
     self.assertEqual(2, self.crowd.maxClusterSize)
Example No. 13
 def setUp(self):
     self.m1 = Message(1, 'sdf',
                       'A project to cluster high-dimensional streams.',
                       test_time - timedelta(seconds=60))
     self.m1.vector = Vector({1: 2, 2: 4})
     self.stream1 = Stream(1, self.m1)
     self.m2 = Message(2, 'sdf',
                       'A project to cluster high-dimensional streams.',
                       test_time)
     self.m2.vector = Vector({2: 4})
     self.stream2 = Stream(2, self.m2)
     self.m3 = Message(3, 'sdf',
                       'A project to cluster high-dimensional streams.',
                       test_time + timedelta(seconds=60))
     self.m3.vector = Vector({2: 4})
     self.stream3 = Stream(3, self.m3)
     self.cluster1 = StreamCluster(self.stream1)
     self.cluster2 = StreamCluster(self.stream2)
     self.cluster3 = StreamCluster(self.stream3)
Example No. 14
 def createDocumentFromLine(docId, line):
     vector = Vector()
     words = line.split()
     for word in words[1:]:
         if word not in wordToDimensionMap:
             wordToDimensionMap[word] = len(wordToDimensionMap)
         wordDimension = wordToDimensionMap[word]
         if wordDimension not in vector: vector[wordDimension] = 1
         else: vector[wordDimension] += 1
     return Document(docId, vector, clusterId=words[0])
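A quick walk-through of the counting loop for a hypothetical input line 'sports win game win' (words[0] is the cluster label; wordToDimensionMap is the enclosing scope's dict, as in Example No. 35):

    wordToDimensionMap = {}
    vector = {}
    for word in 'sports win game win'.split()[1:]:
        if word not in wordToDimensionMap:
            wordToDimensionMap[word] = len(wordToDimensionMap)
        dimension = wordToDimensionMap[word]
        vector[dimension] = vector.get(dimension, 0) + 1
    assert wordToDimensionMap == {'win': 0, 'game': 1}
    assert vector == {0: 2, 1: 1}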
Example No. 15
 def convertTweetJSONToMessage(tweet, **twitter_stream_settings):
     tweetTime = getDateTimeObjectFromTweetTimestamp(tweet['created_at'])
     message = Message(tweet['user']['screen_name'], tweet['id'],
                       tweet['text'], tweetTime)
     message.vector = Vector()
     for phrase in getPhrases(getWordsFromRawEnglishMessage(tweet['text']),
                              twitter_stream_settings['min_phrase_length'],
                              twitter_stream_settings['max_phrase_length']):
         if phrase not in message.vector: message.vector[phrase] = 0
         message.vector[phrase] += 1
     return message
Example No. 16
 def getClusterFromMapFormat(clusterMap):
     dummyMessage = Message(1, '', '', datetime.now())
     dummyMessage.vector = Vector({})
     dummyStream = Stream(1, dummyMessage)
     cluster = StreamCluster(dummyStream)
     cluster.clusterId = clusterMap['clusterId']
     cluster.lastStreamAddedTime = getDateTimeObjectFromTweetTimestamp(
         clusterMap['lastStreamAddedTime'])
     cluster.mergedClustersList = clusterMap['mergedClustersList']
     cluster.documentsInCluster = clusterMap['streams']
     for k, v in clusterMap['dimensions'].iteritems():
         cluster[k] = v
     return cluster
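A sketch of the map format this reader expects (hypothetical values, keyed exactly as read above; the timestamp uses the same tweet format parsed in Example No. 22):

    clusterMap = {
        'clusterId': 'cluster_0',
        'lastStreamAddedTime': 'Tue Mar 01 05:59:59 +0000 2011',
        'mergedClustersList': [],
        'streams': {},                 # streamId -> stored stream data
        'dimensions': {'#tcot': 2.0},  # phrase -> weight
    }
    cluster = getClusterFromMapFormat(clusterMap)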
Example No. 17
    def plot_points(self, fr=-5.0, to=5.0, values=50, name='cubic'):
        vs = []

        points = np.linspace(fr, to, values, endpoint=True)

        # Default to the cubic function so f is always bound, then
        # override it when the given name is a known equation.
        f = Function.cubic
        if name in self.EQUATIONS:
            f = getattr(Function, name)  # returns the function

        for i in points:
            vs.append(Vector([i, f(0, i)]))
        return vs
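The sampling loop amounts to evaluating the chosen equation on a linspace grid. A self-contained sketch with a hypothetical cubic (the Function class, its EQUATIONS registry, and the f(0, i) call convention are assumptions taken from the method above):

    import numpy as np

    def cubic(_, x):  # first argument mirrors the unused self slot in f(0, i)
        return x ** 3

    points = np.linspace(-5.0, 5.0, 50, endpoint=True)
    vs = [(x, cubic(0, x)) for x in points]  # stand-in for Vector([i, f(0, i)])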
Example No. 18
 def test_setSignatureUsingVectors(self):
     phraseTextAndDimensionMap = TwoWayMap()
     phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'a', 1)
     phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'b', 2)
     documentWithDimensionsInVector = Document(1, {'a': 1, 'b': 4})
     documentWithDimensionsNotInVector = Document(1, {'a': 1, 'c': 4})
     vectors = [
         Vector({
             1: 3 / 5.,
             2: -4 / 5.
         }),
         Vector({
             1: -5 / 13.,
             2: 12 / 13.
         })
     ]
     documentWithDimensionsInVector.setSignatureUsingVectors(
         vectors, phraseTextAndDimensionMap)
     documentWithDimensionsNotInVector.setSignatureUsingVectors(
         vectors, phraseTextAndDimensionMap)
     self.assertEqual(Signature('01'),
                      documentWithDimensionsInVector.signature)
     self.assertEqual(Signature('10'),
                      documentWithDimensionsNotInVector.signature)
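The expected signatures can be checked by hand, assuming each bit is '1' exactly when the document's projection onto the corresponding vector is positive (consistent with the assertions above):

    # {'a': 1, 'b': 4} maps to dimensions {1: 1, 2: 4}:
    in_dot0 = 3 / 5. * 1 + -4 / 5. * 4     # -2.6  -> negative -> '0'
    in_dot1 = -5 / 13. * 1 + 12 / 13. * 4  # 43/13 -> positive -> '1'
    # {'a': 1, 'c': 4} maps to {1: 1}, since 'c' has no dimension:
    out_dot0 = 3 / 5. * 1                  # 0.6   -> positive -> '1'
    out_dot1 = -5 / 13. * 1                # -5/13 -> negative -> '0'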
Example No. 19
def iterateUserDocuments(fileName):
    dataForAggregation = defaultdict(Vector)
    textToIdMap = defaultdict(int)
    for tweet in FileIO.iterateJsonFromFile(fileName):
        textVector = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(
            tweet, **default_experts_twitter_stream_settings).vector
        textIdVector = Vector()
        for phrase in textVector:
            if phrase not in textToIdMap:
                textToIdMap[phrase] = str(len(textToIdMap))
            textIdVector[textToIdMap[phrase]] = textVector[phrase]
        screenName = tweet['user']['screen_name'].lower()
        dataForAggregation[screenName] += textIdVector
    for k, v in dataForAggregation.iteritems():
        yield k, v
Example No. 20
def iterateTweetUsersAfterCombiningTweets(fileName, **stream_settings):
    dataForAggregation = defaultdict(Vector)
    textToIdMap = defaultdict(int)
    for tweet in TweetFiles.iterateTweetsFromGzip(fileName):
        textVector = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(
            tweet, **stream_settings).vector
        textIdVector = Vector()
        for phrase in textVector:
            if phrase not in textToIdMap:
                textToIdMap[phrase] = str(len(textToIdMap))
            textIdVector[textToIdMap[phrase]] = textVector[phrase]
        screenName = tweet['user']['screen_name'].lower()
        dataForAggregation[screenName] += textIdVector
    for k, v in dataForAggregation.iteritems():
        yield k, v
Example No. 21
 def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(
         self):
     mergedCluster = StreamCluster.getClusterObjectToMergeFrom(
         self.cluster1)
     mergedCluster.mergeCluster(self.cluster2)
     self.assertEqual([self.stream1, self.stream2],
                      list(mergedCluster.iterateDocumentsInCluster()))
     meanVectorForAllDocuments = Vector.getMeanVector(
         [self.stream1, self.stream2])
     self.assertEqual(meanVectorForAllDocuments, mergedCluster)
     self.assertEqual(
         [mergedCluster.docId, mergedCluster.docId],
         list(doc.clusterId
              for doc in mergedCluster.iterateDocumentsInCluster()))
     self.assertEqual(self.cluster2.lastStreamAddedTime,
                      mergedCluster.lastStreamAddedTime)
Example No. 22
 def setUp(self):
     self.tweet = {
         'user': {'screen_name': 'abc'},
         'id': 10,
         'text': 'A project to cluster high-dimensional streams.',
         'created_at': 'Tue Mar 01 05:59:59 +0000 2011'
     }
     m1 = Message(1, '', '', datetime.now())
     m1.vector = Vector({'#tcot': 2, 'dsf': 4})
     self.cluster1 = StreamCluster(Stream(1, m1))
     m2 = Message(2, '', '', datetime.now())
     m2.vector = Vector({'#tcot': 4})
     self.cluster2 = StreamCluster(Stream(2, m2))
     m3 = Message(3, '', '', datetime.now())
     m3.vector = Vector({'#tcot': 2})
     m4 = Message(4, '', '', datetime.now())
     m4.vector = Vector({'#tcot': 2})
     self.doc1 = Stream(1, m3)
     self.doc2 = Stream(2, m4)
     self.meanVectorForAllDocuments = Vector.getMeanVector(
         [self.cluster1, self.cluster2, self.doc1, self.doc2])
     self.cluster1.addDocument(self.doc1)
     self.cluster2.addDocument(self.doc2)
Example No. 23
 def test_exponentialDecay(self):
     VectorUpdateMethods.exponentialDecay(self.s1, self.v1, 0.5, 1)
     self.assertEqual(Vector({1: 3, 2: 1.5, 3: 3}), self.s1)
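The expected vector follows from one decay step, assuming exponentialDecay(stream, update, decayRate, timeDelta) scales existing coordinates by the decay rate before adding the update (s1 starts at {1: 2., 2: 3.} in Example No. 2 and v1 is {1: 2., 3: 3.}):

    # Plain-dict re-derivation of the assertion above.
    stream, update, decay = {1: 2., 2: 3.}, {1: 2., 3: 3.}, 0.5
    decayed = dict((k, v * decay) for k, v in stream.items())  # {1: 1.0, 2: 1.5}
    for k, v in update.items():
        decayed[k] = decayed.get(k, 0) + v
    assert decayed == {1: 3.0, 2: 1.5, 3: 3.0}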
Example No. 24
 def _getDocumentFromTuple((user, text)):  # Python 2 tuple-parameter unpacking
     vector, words = Vector(), text.split()
     for word in words[1:]:
         if word not in vector: vector[word] = 1
         else: vector[word] += 1
     return Document(user, vector)
Example No. 25
 def add_arrow_vector(self, vector, colour='k', from_vec=Vector([0, 0])):
     self.update_size_if_required(vector)
     arrow_buff = self.get_arrow_buffer()
     self.arrows.append([from_vec, vector, colour])
Example No. 26
 def test_addWithoutDecay(self):
     VectorUpdateMethods.addWithoutDecay(self.s1, self.v1)
     self.assertEqual(Vector({1: 4, 2: 3, 3: 3}), self.s1)
Example No. 27
'''
@author: kykamath
'''
import sys, os, unittest, cjson
sys.path.append('../../../')
from library.vector import Vector
from itertools import combinations
from experiments.ssa.ssa_sim_mr import SSASimilarityMR
from experiments.ssa.ssa import StreamSimilarityAggregationMR, ItemsClusterer,\
    SimilarStreamAggregation

test_file = 'ssa_test.dat.gz'
test_ssa_threshold = 0.75

vectors = {
    '1': Vector({'1': 4, '2': 8}),
    '2': Vector({'1': 4, '2': 8}),
    '3': Vector({'1': 4, '2': 8}),
    '4': Vector({'2': 8}),
    '5': Vector({'3': 4, '4': 8}),
    '6': Vector({'4': 8}),
    '7': Vector({'3': 4, '4': 8}),
    '8': Vector({'3': 4})
}

def createTestFile():
    with open(test_file, 'w') as f:
        for v1, v2 in combinations(vectors.iteritems(), 2):
            f.write('%s\t%s\n' % (cjson.encode(['x']),
                                  cjson.encode([(v1[0], v1[1]),
                                                (v2[0], v2[1])])))

class ItemsClustererTests(unittest.TestCase):
    def setUp(self): self.clusterer = ItemsClusterer()
    def test_addNewCluster(self):
Example No. 28
 def topDimensions(self, numberOfDimensions=10):
     return Vector.getMeanVector(
         self.clusters.itervalues()).getTopDimensions(
             numberOfFeatures=numberOfDimensions)
Example No. 29
def createDocumentFromLine(docId, line):
    vector, words = Vector(), line.split()
    for word in words[1:]:
        if word not in vector: vector[word] = 1
        else: vector[word] += 1
    return Document(words[0], vector)
Example No. 30
 def mapper(self, _, value):
     if False: yield # I'm a generator!
     [(id0, vec0), (id1, vec1)] = value
     vec0, vec1 = Vector(vec0), Vector(vec1)
     if vec0.cosineSimilarity(vec1) >= self.ssa_threshold:
         if id0 < id1: self.streamIdToSimilarStreamsMap[id0].add(id1)
         else: self.streamIdToSimilarStreamsMap[id1].add(id0)
Example No. 31
 def topDimensions(self, numberOfDimensions=10):
     return Vector.getMeanVector(self.clusters.itervalues()).getTopDimensions(numberOfFeatures=numberOfDimensions)
Example No. 32
 def test_updateForMessage_exponentialDecay(self):
     self.stream.updateForMessage(self.m2,
                                  VectorUpdateMethods.exponentialDecay,
                                  **stream_settings)
     self.assertEqual(self.stream, Vector({1: 0.5, 2: 4.5}))
Example No. 33
 def test_updateForMessage_addWithoutDecay(self):
     self.stream.updateForMessage(self.m2,
                                  VectorUpdateMethods.addWithoutDecay,
                                  **stream_settings)
     self.assertEqual(self.stream, Vector({1: 1., 2: 6.}))
Example No. 34
def meanClusteringDistance(clusterMeans):
    return np.mean([Vector.euclideanDistance(Vector(dict(c1)), Vector(dict(c2)))
                    for c1, c2 in combinations(clusterMeans, 2)])
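A small worked check (hypothetical inputs; assumes euclideanDistance is the usual L2 distance): three 1-d cluster means at 0, 3, and 6 have pairwise distances 3, 6, and 3, so the mean is 4.

    # Cluster means passed as (dimension, weight) item lists, as dict(c1) expects.
    means = [[(1, 0.)], [(1, 3.)], [(1, 6.)]]
    print meanClusteringDistance(means)  # -> 4.0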

def writeLocationToUserMap(place):
Example No. 35
def offlineLSHClusteringDemo():
    wordToDimensionMap = {}

    def createDocumentFromLine(docId, line):
        vector = Vector()
        words = line.split()
        for word in words[1:]:
            if word not in wordToDimensionMap:
                wordToDimensionMap[word] = len(wordToDimensionMap)
            wordDimension = wordToDimensionMap[word]
            if wordDimension not in vector: vector[wordDimension] = 1
            else: vector[wordDimension] += 1
        return Document(docId, vector, clusterId=words[0])

    dimensions = 53
    signatureLength = 13
    numberOfPermutations = 5

    unitVector = RandomGaussianUnitVector(dimensions=dimensions, mu=0, sigma=1)
    vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, unitVector)
    signaturePermutations = [
        SignaturePermutationWithTrie(signatureLength)
        for i in range(numberOfPermutations)
    ]

    permutatedUnitVectors = [
        unitVector.getPermutedVector(r) for r in vectorPermutations
    ]

    # Build LSH Model.
    # Read training documents.
    traningDocumentsMap = {}
    for docId, l in enumerate(
            FileIO.iterateLinesFromFile('../data/train_offline.dat')):
        traningDocumentsMap[docId] = createDocumentFromLine(docId, l)
    # Construct cluster vectors.
    clusterToDocumentsMap = defaultdict(list)
    for document in traningDocumentsMap.values():
        clusterToDocumentsMap[document.clusterId].append(document)
    clusterMap = {}
    for k, v in clusterToDocumentsMap.iteritems():
        clusterMap[k] = Document(docId=k,
                                 vector=Vector.getMeanVector(v),
                                 clusterId=k)

    # Create signatures and signaturePermutations for all the clusters.
    map(
        lambda document: document.setSignatureUsingVectors(
            permutatedUnitVectors), clusterMap.values())
    for permutation in signaturePermutations:
        for document in clusterMap.values():
            permutation.addDocument(document)

    # Testing the model.
    # Read testing documents.
    testDocumentsMap = {}
    for docId, l in enumerate(
            FileIO.iterateLinesFromFile('../data/test_offline.dat')):
        testDocumentsMap[docId] = createDocumentFromLine(docId, l)
    # Create signatures for test documents
    map(
        lambda document: document.setSignatureUsingVectors(
            permutatedUnitVectors), testDocumentsMap.values())

    predicted, labels = [], []
    for t in testDocumentsMap.values():
        possibleNearestClusters = reduce(
            lambda x, y: x.union(y),
            (permutation.getNearestDocuments(t)
             for permutation in signaturePermutations), set())
        predictedClass = max(
            ((clusterId, clusterMap[clusterId].cosineSimilarity(t))
             for clusterId in possibleNearestClusters),
            key=itemgetter(1))
        predicted.append(predictedClass[0])
        labels.append(t.clusterId)
    return EvaluationMetrics.purity(predicted, labels)
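A minimal driver sketch (assumes the ../data/train_offline.dat and ../data/test_offline.dat files exist, and that EvaluationMetrics.purity returns a float in [0, 1], higher meaning test documents land in their labeled clusters):

    if __name__ == '__main__':
        print 'purity: %0.3f' % offlineLSHClusteringDemo()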