def clusterFilteringMethod(hdStreamClusteringObject, currentMessageTime):
    """Drop filtered-out and inactive clusters, optionally combine the rest, then reset the signature tries.

    Steps, in order:
      1. Delete every cluster that StreamCluster.getClustersByAttributeAndThreshold selects with
         BELOW_THRESHOLD for stream_settings['cluster_filter_attribute'] and
         stream_settings['cluster_filter_threshold'].
      2. Delete every cluster selected with BELOW_THRESHOLD on 'lastStreamAddedTime', using
         currentMessageTime - stream_settings['cluster_inactivity_time_in_seconds'] as threshold.
      3. If combineClustersMethod is set, replace the cluster map with what it returns.
      4. Call DataStreamMethods._resetClustersInSignatureTries.

    hdStreamClusteringObject -- object exposing `clusters` (mapping keyed by clusterId),
        `stream_settings` and `combineClustersMethod`.
    currentMessageTime -- time of the message being processed (presumably a datetime, with the
        inactivity setting a timedelta -- confirm against the caller).
    """
    streamSettings = hdStreamClusteringObject.stream_settings
    # Snapshot each selection before deleting. If the selector is lazy and `clusters.values()` is a
    # live view (Python 3), deleting during iteration raises
    # "RuntimeError: dictionary changed size during iteration".
    clustersBelowFilter = list(StreamCluster.getClustersByAttributeAndThreshold(
        hdStreamClusteringObject.clusters.values(),
        streamSettings['cluster_filter_attribute'],
        streamSettings['cluster_filter_threshold'],
        StreamCluster.BELOW_THRESHOLD))
    for cluster in clustersBelowFilter:
        del hdStreamClusteringObject.clusters[cluster.clusterId]
    inactiveClusters = list(StreamCluster.getClustersByAttributeAndThreshold(
        hdStreamClusteringObject.clusters.values(),
        'lastStreamAddedTime',
        currentMessageTime - streamSettings['cluster_inactivity_time_in_seconds'],
        StreamCluster.BELOW_THRESHOLD))
    for cluster in inactiveClusters:
        del hdStreamClusteringObject.clusters[cluster.clusterId]
    if hdStreamClusteringObject.combineClustersMethod is not None:
        hdStreamClusteringObject.clusters = hdStreamClusteringObject.combineClustersMethod(
            hdStreamClusteringObject.clusters, **streamSettings)
    DataStreamMethods._resetClustersInSignatureTries(hdStreamClusteringObject, currentMessageTime)
 def getClusterAndUpdateExistingClusters(self, stream):
     """Add `stream` to its predicted cluster, or register a new cluster when none is predicted.

     stream -- the document passed to getClusterForDocument and then either added to the
         predicted cluster or used to seed a new StreamCluster.
     Returns None.
     """
     predictedCluster = self.getClusterForDocument(stream)
     # Identity test: only a missing prediction (None) opens a new cluster.
     if predictedCluster is not None:
         self.clusters[predictedCluster].addDocument(stream, **self.stream_settings)
         return
     newCluster = StreamCluster(stream)
     newCluster.setSignatureUsingVectorPermutations(self.unitVector, self.vectorPermutations, self.phraseTextAndDimensionMap)
     # Register the new cluster with every signature permutation before storing it.
     for permutation in self.signaturePermutations:
         permutation.addDocument(newCluster)
     self.clusters[newCluster.clusterId] = newCluster
示例#3
0
 def setUp(self):
     """Create one message with a vector, its stream, a cluster for that stream, and the crowd under test."""
     messageTime = test_time - timedelta(seconds=60)
     self.m1 = Message(1, 'sdf', 'A project to cluster high-dimensional streams.', messageTime)
     self.m1.vector = Vector({1: 1.0, 2: 3.0})
     self.stream = Stream(1, self.m1)
     self.cluster = StreamCluster(self.stream)
     # The crowd is created from the cluster at test_time.
     self.crowd = Crowd(self.cluster, test_time)
 def test_clustersIteration(self):
     """Check which clusters getClustersByAttributeAndThreshold yields for two 'lastStreamAddedTime' thresholds."""
     candidates = [self.cluster1, self.cluster2, self.cluster3]
     # With test_time as the threshold only cluster1 is expected.
     selectedAtTestTime = list(StreamCluster.getClustersByAttributeAndThreshold(
         candidates, 'lastStreamAddedTime', test_time, StreamCluster.BELOW_THRESHOLD))
     self.assertEqual([self.cluster1], selectedAtTestTime)
     # With the threshold sixty seconds later cluster2 is expected as well.
     selectedLater = list(StreamCluster.getClustersByAttributeAndThreshold(
         candidates, 'lastStreamAddedTime', test_time + timedelta(seconds=60), StreamCluster.BELOW_THRESHOLD))
     self.assertEqual([self.cluster1, self.cluster2], selectedLater)
 def getClusterFromMapFormat(clusterMap):
     """Rebuild a StreamCluster from its map (dict) representation.

     clusterMap -- mapping read for the keys 'clusterId', 'lastStreamAddedTime',
         'mergedClustersList', 'streams' and 'dimensions'.
     Returns the populated StreamCluster.
     """
     # StreamCluster is constructed from a stream, so seed it with a placeholder stream whose
     # vector is empty; the fields taken from clusterMap are assigned afterwards.
     placeholderMessage = Message(1, '', '', datetime.now())
     placeholderMessage.vector = Vector({})
     placeholderStream = Stream(1, placeholderMessage)
     cluster = StreamCluster(placeholderStream)
     cluster.clusterId = clusterMap['clusterId']
     cluster.lastStreamAddedTime = getDateTimeObjectFromTweetTimestamp(clusterMap['lastStreamAddedTime'])
     cluster.mergedClustersList = clusterMap['mergedClustersList']
     cluster.documentsInCluster = clusterMap['streams']
     # items() exists on both Python 2 and Python 3 dicts; iteritems() is Python 2 only.
     for dimension, value in clusterMap['dimensions'].items():
         cluster[dimension] = value
     return cluster
 def getClusterAndUpdateExistingClusters(self, stream):
     """Add `stream` to its predicted cluster, or register a new cluster when none is predicted.

     stream -- the document passed to getClusterForDocument and then either added to the
         predicted cluster or used to seed a new StreamCluster.
     Returns None.
     """
     predictedCluster = self.getClusterForDocument(stream)
     # Do not remove this comment. Might need this if StreamCluster is used again in future.
     # if predictedCluster!=None: self.clusters[predictedCluster].addStream(stream, **self.stream_settings)
     #
     # Identity test: only a missing prediction (None) opens a new cluster.
     if predictedCluster is not None:
         self.clusters[predictedCluster].addDocument(stream, **self.stream_settings)
         return
     newCluster = StreamCluster(stream)
     newCluster.setSignatureUsingVectorPermutations(self.unitVector, self.vectorPermutations, self.phraseTextAndDimensionMap)
     # Register the new cluster with every signature permutation before storing it.
     for permutation in self.signaturePermutations:
         permutation.addDocument(newCluster)
     self.clusters[newCluster.clusterId] = newCluster
示例#7
0
class CrowdTests(unittest.TestCase):
    """Tests for Crowd, starting from a crowd created with one single-stream cluster."""

    def setUp(self):
        """Create one message with a vector, its stream, a cluster for it, and the crowd under test."""
        messageTime = test_time - timedelta(seconds=60)
        self.m1 = Message(1, 'sdf', 'A project to cluster high-dimensional streams.', messageTime)
        self.m1.vector = Vector({1: 1.0, 2: 3.0})
        self.stream = Stream(1, self.m1)
        self.cluster = StreamCluster(self.stream)
        self.crowd = Crowd(self.cluster, test_time)

    def test_intitialization(self):
        """The crowd id equals the id of the cluster it was created with."""
        expectedCrowdId = self.cluster.clusterId
        self.assertEqual(expectedCrowdId, self.crowd.crowdId)

    def test_append(self):
        """Appending the cluster one day later adds a second epoch key and updates lifespan and end time."""
        nextDay = test_time + timedelta(days=1)
        self.crowd.append(self.cluster, nextDay)
        startEpoch = GeneralMethods.getEpochFromDateTimeObject(test_time)
        endEpoch = GeneralMethods.getEpochFromDateTimeObject(nextDay)
        # The crowd's clusters are keyed by the epoch of the time they were added at.
        self.assertEqual([startEpoch, endEpoch], sorted(self.crowd.clusters.keys()))
        self.assertEqual(StreamCluster, type(self.crowd.clusters[startEpoch]))
        self.assertEqual(2, self.crowd.lifespan)
        self.assertEqual(getStringRepresentationForTweetTimestamp(test_time),
                         getStringRepresentationForTweetTimestamp(self.crowd.startTime))
        self.assertEqual(getStringRepresentationForTweetTimestamp(nextDay),
                         getStringRepresentationForTweetTimestamp(self.crowd.endTime))

    def test_maxClusterSize(self):
        """maxClusterSize goes from 1 to 2 once a second stream is added to the cluster."""
        self.assertEqual(1, self.crowd.maxClusterSize)
        secondMessage = Message(4, 'sdf', 'A project to cluster high-dimensional streams.', test_time)
        secondMessage.vector = Vector({2: 4})
        secondStream = Stream(4, secondMessage)
        self.cluster.addDocument(secondStream)
        self.assertEqual(2, self.crowd.maxClusterSize)

    def test_crowdSize(self):
        """crowdSize is 3 after adding streams 2 and 3, and stays 3 after appending a cluster built on stream id 3."""
        self.assertEqual(1, self.crowd.crowdSize)
        for streamId in (2, 3):
            self.cluster.addDocument(Stream(streamId, self.m1))
        self.assertEqual(3, self.crowd.crowdSize)
        # The appended cluster reuses stream id 3, which the crowd already contains.
        laterCluster = StreamCluster(Stream(3, self.m1))
        self.crowd.append(laterCluster, test_time + timedelta(days=2))
        self.assertNotEqual(4, self.crowd.crowdSize)
        self.assertEqual(3, self.crowd.crowdSize)
示例#8
0
 def getClusterAndUpdateExistingClusters(self, stream):
     """Add `stream` to its predicted cluster, or register a new cluster when none is predicted.

     stream -- the document passed to getClusterForDocument and then either added to the
         predicted cluster or used to seed a new StreamCluster.
     Returns None.
     """
     predictedCluster = self.getClusterForDocument(stream)
     # Identity test: only a missing prediction (None) opens a new cluster.
     if predictedCluster is not None:
         self.clusters[predictedCluster].addDocument(
             stream, **self.stream_settings)
         return
     newCluster = StreamCluster(stream)
     newCluster.setSignatureUsingVectorPermutations(
         self.unitVector, self.vectorPermutations,
         self.phraseTextAndDimensionMap)
     # Register the new cluster with every signature permutation before storing it.
     for permutation in self.signaturePermutations:
         permutation.addDocument(newCluster)
     self.clusters[newCluster.clusterId] = newCluster
示例#9
0
 def test_clustersIteration(self):
     """Check which clusters getClustersByAttributeAndThreshold yields for two 'lastStreamAddedTime' thresholds."""
     candidates = [self.cluster1, self.cluster2, self.cluster3]

     def selectBelow(threshold):
         # Collect what the selector yields for the given 'lastStreamAddedTime' threshold.
         return list(StreamCluster.getClustersByAttributeAndThreshold(
             candidates, 'lastStreamAddedTime', threshold,
             StreamCluster.BELOW_THRESHOLD))

     self.assertEqual([self.cluster1], selectBelow(test_time))
     self.assertEqual([self.cluster1, self.cluster2],
                      selectBelow(test_time + timedelta(seconds=60)))
示例#10
0
    def combineClusters(clusters, **twitter_stream_settings):
        """Greedily merge clusters with similar hashtag sets and return the merged clusters by id.

        Clusters are visited in ascending order of the integer in their id. Each cluster is merged
        into the first already-merged cluster for which the union of both hashtag sets is
        non-empty and jaccard_distance(...) <= 1 - cluster_merging_jaccard_distance_threshold;
        otherwise it starts a new merged cluster via StreamCluster.getClusterObjectToMergeFrom.

        clusters -- mapping of cluster id -> cluster; ids must contain an integer after the
            first underscore.
        twitter_stream_settings -- settings; 'cluster_merging_jaccard_distance_threshold' is
            read only when a comparison is actually made.
        Returns a dict mapping merged cluster id -> merged cluster, each carrying a
        `mergedClustersList` of the original cluster ids it absorbed.
        """
        def getHashtagSet(vector):
            # A dimension may contain several whitespace-separated words; keep the '#'-prefixed ones.
            return set(word for dimension in vector
                       for word in dimension.split() if word.startswith('#'))

        def getClusterInt(clusterId):
            # Sort key: the integer that follows the first underscore of the cluster id.
            return int(clusterId.split('_')[1])

        mergedClustersMap = {}
        orderedClusters = [clusters[clusterId] for clusterId in sorted(clusters, key=getClusterInt)]
        for cluster in orderedClusters:
            mergedClusterId = None
            # `cluster` is not modified during the scan below, so extract its hashtags once, and
            # only when there is at least one merged cluster to compare against.
            clusterHashtags = getHashtagSet(cluster) if mergedClustersMap else None
            # values() works on Python 2 and 3; the map is not modified inside this loop.
            for mergedCluster in mergedClustersMap.values():
                mergedClusterHashtags = getHashtagSet(mergedCluster)
                if (clusterHashtags.union(mergedClusterHashtags)
                        and jaccard_distance(clusterHashtags, mergedClusterHashtags)
                        <= 1 - twitter_stream_settings['cluster_merging_jaccard_distance_threshold']):
                    mergedCluster.mergeCluster(cluster)
                    mergedCluster.mergedClustersList.append(cluster.clusterId)
                    mergedClusterId = mergedCluster.clusterId
                    break
            if mergedClusterId is None:
                mergedCluster = StreamCluster.getClusterObjectToMergeFrom(cluster)
                mergedCluster.mergedClustersList = [cluster.clusterId]
                mergedClustersMap[mergedCluster.clusterId] = mergedCluster
        return mergedClustersMap
示例#11
0
 def getClusterAndUpdateExistingClusters(self, stream):
     """Add `stream` to its predicted cluster, or register a new cluster when none is predicted.

     stream -- the document passed to getClusterForDocument and then either added to the
         predicted cluster or used to seed a new StreamCluster.
     Returns None.
     """
     predictedCluster = self.getClusterForDocument(stream)
     # Do not remove this comment. Might need this if StreamCluster is used again in future.
     # if predictedCluster!=None: self.clusters[predictedCluster].addStream(stream, **self.stream_settings)
     #
     # Identity test: only a missing prediction (None) opens a new cluster.
     if predictedCluster is not None:
         self.clusters[predictedCluster].addDocument(
             stream, **self.stream_settings)
         return
     newCluster = StreamCluster(stream)
     newCluster.setSignatureUsingVectorPermutations(
         self.unitVector, self.vectorPermutations,
         self.phraseTextAndDimensionMap)
     # Register the new cluster with every signature permutation before storing it.
     for permutation in self.signaturePermutations:
         permutation.addDocument(newCluster)
     self.clusters[newCluster.clusterId] = newCluster
 def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(self):
     """Merge cluster2 into the merge object built from cluster1; check documents, mean vector, ids and last-added time."""
     merged = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
     merged.mergeCluster(self.cluster2)
     mergedDocuments = list(merged.iterateDocumentsInCluster())
     self.assertEqual([self.stream1, self.stream2], mergedDocuments)
     expectedMean = Vector.getMeanVector([self.stream1, self.stream2])
     self.assertEqual(expectedMean, merged)
     # Every document is expected to carry the merged cluster's docId as its clusterId.
     documentClusterIds = [doc.clusterId for doc in merged.iterateDocumentsInCluster()]
     self.assertEqual([merged.docId, merged.docId], documentClusterIds)
     # cluster2's time is the later one and is expected on the merged cluster.
     self.assertEqual(self.cluster2.lastStreamAddedTime, merged.lastStreamAddedTime)
 def setUp(self):
     """Build a sample tweet, two single-stream clusters, two extra documents, and the expected mean vector."""
     self.tweet = {'user':{'screen_name': 'abc'}, 'id':10, 'text':'A project to cluster high-dimensional streams.', 'created_at': 'Tue Mar 01 05:59:59 +0000 2011'}
     firstMessage = Message(1, '', '', datetime.now())
     firstMessage.vector = Vector({'#tcot': 2, 'dsf': 4})
     self.cluster1 = StreamCluster(Stream(1, firstMessage))
     secondMessage = Message(2, '', '', datetime.now())
     secondMessage.vector = Vector({'#tcot': 4})
     self.cluster2 = StreamCluster(Stream(2, secondMessage))
     thirdMessage = Message(3, '', '', datetime.now())
     thirdMessage.vector = Vector(Vector({'#tcot': 2}))
     fourthMessage = Message(4, '', '', datetime.now())
     fourthMessage.vector = Vector(Vector({'#tcot': 2}))
     self.doc1 = Stream(1, thirdMessage)
     self.doc2 = Stream(2, fourthMessage)
     # The expected mean is taken before the documents are added to the clusters.
     self.meanVectorForAllDocuments = Vector.getMeanVector(
         [self.cluster1, self.cluster2, self.doc1, self.doc2])
     self.cluster1.addDocument(self.doc1)
     self.cluster2.addDocument(self.doc2)
示例#14
0
 def test_crowdSize(self):
     """crowdSize is 3 after adding streams 2 and 3, and stays 3 after appending a cluster built on stream id 3."""
     self.assertEqual(1, self.crowd.crowdSize)
     for streamId in (2, 3):
         self.cluster.addDocument(Stream(streamId, self.m1))
     self.assertEqual(3, self.crowd.crowdSize)
     # The appended cluster reuses stream id 3, which the crowd already contains.
     laterCluster = StreamCluster(Stream(3, self.m1))
     self.crowd.append(laterCluster, test_time + timedelta(days=2))
     self.assertNotEqual(4, self.crowd.crowdSize)
     self.assertEqual(3, self.crowd.crowdSize)
class TwitterCrowdsSpecificMethodsTests(unittest.TestCase):
    """Tests for TwitterCrowdsSpecificMethods: tweet conversion, cluster combining and map (de)serialisation."""

    def setUp(self):
        """Build a sample tweet, two single-stream clusters, two extra documents, and the expected mean vector."""
        self.tweet = {'user':{'screen_name': 'abc'}, 'id':10, 'text':'A project to cluster high-dimensional streams.', 'created_at': 'Tue Mar 01 05:59:59 +0000 2011'}
        m1 = Message(1, '', '', datetime.now())
        m1.vector = Vector({'#tcot': 2, 'dsf': 4})
        self.cluster1 = StreamCluster(Stream(1, m1))
        m2 = Message(2, '', '', datetime.now())
        m2.vector = Vector({'#tcot': 4})
        self.cluster2 = StreamCluster(Stream(2, m2))
        m3 = Message(3, '', '', datetime.now())
        m3.vector = Vector(Vector({'#tcot': 2}))
        m4 = Message(4, '', '', datetime.now())
        m4.vector = Vector(Vector({'#tcot': 2}))
        self.doc1 = Stream(1, m3)
        self.doc2 = Stream(2, m4)
        # The expected mean is taken before the documents are added to the clusters.
        self.meanVectorForAllDocuments = Vector.getMeanVector([self.cluster1, self.cluster2, self.doc1, self.doc2])
        self.cluster1.addDocument(self.doc1)
        self.cluster2.addDocument(self.doc2)

    def test_convertTweetJSONToMessage(self):
        """The sample tweet's text converts to the expected word-count vector."""
        message = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(self.tweet, **twitter_stream_settings)
        self.assertEqual({'project': 1, 'cluster': 1, 'streams': 1, 'highdimensional': 1}, message.vector)

    def test_combineClusters(self):
        """Two clusters sharing '#tcot' combine into a single merged cluster."""
        clustersMap = {self.cluster1.clusterId: self.cluster1, self.cluster2.clusterId: self.cluster2}
        clustersMap = TwitterCrowdsSpecificMethods.combineClusters(clustersMap, **twitter_stream_settings)
        self.assertEqual(1, len(clustersMap))
        # list(...) keeps this working where values() is a non-indexable view (Python 3).
        mergedCluster = list(clustersMap.values())[0]
        self.assertEqual([self.doc1, self.doc2], list(mergedCluster.iterateDocumentsInCluster()))
        self.assertEqual(self.meanVectorForAllDocuments, mergedCluster)
        self.assertEqual([mergedCluster.docId, mergedCluster.docId],
                         [doc.clusterId for doc in mergedCluster.iterateDocumentsInCluster()])
        self.assertEqual([self.cluster1.clusterId, self.cluster2.clusterId], mergedCluster.mergedClustersList)

    def test_getClusterInMapFormat(self):
        """getClusterInMapFormat returns the expected dict for a merge object built from cluster1."""
        mergedCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
        mergedCluster.mergedClustersList = [self.cluster1.clusterId]
        mergedCluster.lastStreamAddedTime = test_time
        mapRepresentation = {'clusterId': mergedCluster.clusterId, 'lastStreamAddedTime':getStringRepresentationForTweetTimestamp(mergedCluster.lastStreamAddedTime), 'mergedClustersList': [self.cluster1.clusterId], 'streams': [self.doc1.docId], 'dimensions': {'#tcot':2, 'dsf':2}}
        self.assertEqual(mapRepresentation, TwitterCrowdsSpecificMethods.getClusterInMapFormat(mergedCluster))

    def test_getClusterFromMapFormat(self):
        """getClusterFromMapFormat restores id, merged list, streams, dimensions and last-added time from a dict."""
        mapRepresentation = {'clusterId': 1, 'mergedClustersList': [self.cluster1.clusterId], 'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(test_time), 'streams': [self.doc1.docId], 'dimensions': {'#tcot':2, 'dsf':2}}
        cluster = TwitterCrowdsSpecificMethods.getClusterFromMapFormat(mapRepresentation)
        self.assertEqual(1, cluster.clusterId)
        self.assertEqual([self.cluster1.clusterId], cluster.mergedClustersList)
        self.assertEqual([self.doc1.docId], cluster.documentsInCluster)
        self.assertEqual({'#tcot':2, 'dsf':2}, cluster)
        # Timestamps are compared through their string representation.
        self.assertEqual(getStringRepresentationForTweetTimestamp(test_time), getStringRepresentationForTweetTimestamp(cluster.lastStreamAddedTime))
 def test_getClusterObjectToMergeFrom(self):
     """The merge object built from cluster1 equals it in content but has a different clusterId."""
     source = self.cluster1
     originalDocuments = list(source.iterateDocumentsInCluster())
     merged = StreamCluster.getClusterObjectToMergeFrom(source)
     self.assertEqual(test_time - timedelta(seconds=60), merged.lastStreamAddedTime)
     self.assertEqual(source, merged)
     self.assertNotEqual(source.clusterId, merged.clusterId)
     self.assertEqual(originalDocuments, list(merged.iterateDocumentsInCluster()))
     # These attributes are expected to be carried over unchanged.
     for attribute in ('aggregateVector', 'vectorWeights', 'lastStreamAddedTime'):
         self.assertEqual(getattr(source, attribute), getattr(merged, attribute))
示例#17
0
 def setUp(self):
     """Create three messages one minute apart around test_time, their streams, and one cluster per stream."""
     messageText = 'A project to cluster high-dimensional streams.'
     oneMinute = timedelta(seconds=60)
     self.m1 = Message(1, 'sdf', messageText, test_time - oneMinute)
     self.m1.vector = Vector({1: 2, 2: 4})
     self.stream1 = Stream(1, self.m1)
     self.m2 = Message(2, 'sdf', messageText, test_time)
     self.m2.vector = Vector({2: 4})
     self.stream2 = Stream(2, self.m2)
     self.m3 = Message(3, 'sdf', messageText, test_time + oneMinute)
     self.m3.vector = Vector({2: 4})
     self.stream3 = Stream(3, self.m3)
     # Clusters are created in stream order: 1, 2, 3.
     self.cluster1 = StreamCluster(self.stream1)
     self.cluster2 = StreamCluster(self.stream2)
     self.cluster3 = StreamCluster(self.stream3)
示例#18
0
 def test_mergeCluster_lastStreamAddedTime_lesser_than_original_cluster(
         self):
     """Merging the older cluster1 into a merge object built from cluster3 keeps cluster3's last-added time."""
     older, newer = self.cluster1, self.cluster3
     merged = StreamCluster.getClusterObjectToMergeFrom(newer)
     merged.mergeCluster(older)
     self.assertTrue(older.lastStreamAddedTime < newer.lastStreamAddedTime)
     self.assertEqual(newer.lastStreamAddedTime, merged.lastStreamAddedTime)
     self.assertNotEqual(older.lastStreamAddedTime, merged.lastStreamAddedTime)
class CrowdTests(unittest.TestCase):
    """Tests for Crowd, starting from a crowd created with one single-stream cluster."""

    def setUp(self):
        """Create one message with a vector, its stream, a cluster for it, and the crowd under test."""
        self.m1 = Message(
            1, 'sdf', 'A project to cluster high-dimensional streams.',
            test_time - timedelta(seconds=60))
        self.m1.vector = Vector({1: 1.0, 2: 3.0})
        self.stream = Stream(1, self.m1)
        self.cluster = StreamCluster(self.stream)
        self.crowd = Crowd(self.cluster, test_time)

    def test_intitialization(self):
        """The crowd id equals the id of the cluster it was created with."""
        self.assertEqual(self.cluster.clusterId, self.crowd.crowdId)

    def test_append(self):
        """Appending the cluster one day later adds a second epoch key and updates lifespan and end time."""
        dayLater = test_time + timedelta(days=1)
        self.crowd.append(self.cluster, dayLater)
        firstKey = GeneralMethods.getEpochFromDateTimeObject(test_time)
        secondKey = GeneralMethods.getEpochFromDateTimeObject(dayLater)
        # The crowd's clusters are keyed by the epoch of the time they were added at.
        self.assertEqual([firstKey, secondKey], sorted(self.crowd.clusters.keys()))
        self.assertEqual(StreamCluster, type(self.crowd.clusters[firstKey]))
        self.assertEqual(2, self.crowd.lifespan)
        expectedStart = getStringRepresentationForTweetTimestamp(test_time)
        self.assertEqual(expectedStart, getStringRepresentationForTweetTimestamp(self.crowd.startTime))
        expectedEnd = getStringRepresentationForTweetTimestamp(dayLater)
        self.assertEqual(expectedEnd, getStringRepresentationForTweetTimestamp(self.crowd.endTime))

    def test_maxClusterSize(self):
        """maxClusterSize goes from 1 to 2 once a second stream is added to the cluster."""
        self.assertEqual(1, self.crowd.maxClusterSize)
        extraMessage = Message(4, 'sdf', 'A project to cluster high-dimensional streams.', test_time)
        extraMessage.vector = Vector({2: 4})
        self.cluster.addDocument(Stream(4, extraMessage))
        self.assertEqual(2, self.crowd.maxClusterSize)

    def test_crowdSize(self):
        """crowdSize is 3 after adding streams 2 and 3, and stays 3 after appending a cluster built on stream id 3."""
        self.assertEqual(1, self.crowd.crowdSize)
        self.cluster.addDocument(Stream(2, self.m1))
        self.cluster.addDocument(Stream(3, self.m1))
        self.assertEqual(3, self.crowd.crowdSize)
        # The appended cluster reuses stream id 3, which the crowd already contains.
        appended = StreamCluster(Stream(3, self.m1))
        self.crowd.append(appended, test_time + timedelta(days=2))
        self.assertNotEqual(4, self.crowd.crowdSize)
        self.assertEqual(3, self.crowd.crowdSize)
示例#20
0
 def clusterFilteringMethod(hdStreamClusteringObject, currentMessageTime):
     """Drop filtered-out and inactive clusters, optionally combine the rest, then reset the signature tries.

     Steps, in order:
       1. Delete every cluster that StreamCluster.getClustersByAttributeAndThreshold selects with
          BELOW_THRESHOLD for stream_settings['cluster_filter_attribute'] and
          stream_settings['cluster_filter_threshold'].
       2. Delete every cluster selected with BELOW_THRESHOLD on 'lastStreamAddedTime', using
          currentMessageTime - stream_settings['cluster_inactivity_time_in_seconds'] as threshold.
       3. If combineClustersMethod is set, replace the cluster map with what it returns.
       4. Call DataStreamMethods._resetClustersInSignatureTries.

     hdStreamClusteringObject -- object exposing `clusters` (mapping keyed by clusterId),
         `stream_settings` and `combineClustersMethod`.
     currentMessageTime -- time of the message being processed (presumably a datetime, with the
         inactivity setting a timedelta -- confirm against the caller).
     """
     streamSettings = hdStreamClusteringObject.stream_settings
     # Snapshot each selection before deleting. If the selector is lazy and `clusters.values()` is
     # a live view (Python 3), deleting during iteration raises
     # "RuntimeError: dictionary changed size during iteration".
     clustersBelowFilter = list(StreamCluster.getClustersByAttributeAndThreshold(
         hdStreamClusteringObject.clusters.values(),
         streamSettings['cluster_filter_attribute'],
         streamSettings['cluster_filter_threshold'],
         StreamCluster.BELOW_THRESHOLD))
     for cluster in clustersBelowFilter:
         del hdStreamClusteringObject.clusters[cluster.clusterId]
     inactiveClusters = list(StreamCluster.getClustersByAttributeAndThreshold(
         hdStreamClusteringObject.clusters.values(),
         'lastStreamAddedTime',
         currentMessageTime - streamSettings['cluster_inactivity_time_in_seconds'],
         StreamCluster.BELOW_THRESHOLD))
     for cluster in inactiveClusters:
         del hdStreamClusteringObject.clusters[cluster.clusterId]
     if hdStreamClusteringObject.combineClustersMethod is not None:
         hdStreamClusteringObject.clusters = hdStreamClusteringObject.combineClustersMethod(
             hdStreamClusteringObject.clusters, **streamSettings)
     DataStreamMethods._resetClustersInSignatureTries(
         hdStreamClusteringObject, currentMessageTime)
 def setUp(self):
     """Create three messages one minute apart around test_time, their streams, and one cluster per stream."""
     messageText = 'A project to cluster high-dimensional streams.'
     self.m1 = Message(1, 'sdf', messageText, test_time - timedelta(seconds=60))
     self.m1.vector = Vector({1: 2, 2: 4})
     self.stream1 = Stream(1, self.m1)
     self.m2 = Message(2, 'sdf', messageText, test_time)
     self.m2.vector = Vector({2: 4})
     self.stream2 = Stream(2, self.m2)
     self.m3 = Message(3, 'sdf', messageText, test_time + timedelta(seconds=60))
     self.m3.vector = Vector({2: 4})
     self.stream3 = Stream(3, self.m3)
     # Clusters are created in stream order: 1, 2, 3.
     self.cluster1 = StreamCluster(self.stream1)
     self.cluster2 = StreamCluster(self.stream2)
     self.cluster3 = StreamCluster(self.stream3)
示例#22
0
 def test_getClusterObjectToMergeFrom(self):
     """The merge object built from cluster1 equals it in content but has a different clusterId."""
     original = self.cluster1
     documentsBefore = list(original.iterateDocumentsInCluster())
     mergeObject = StreamCluster.getClusterObjectToMergeFrom(original)
     expectedTime = test_time - timedelta(seconds=60)
     self.assertEqual(expectedTime, mergeObject.lastStreamAddedTime)
     self.assertEqual(original, mergeObject)
     # Only the id is expected to differ from the source cluster.
     self.assertNotEqual(original.clusterId, mergeObject.clusterId)
     documentsAfter = list(mergeObject.iterateDocumentsInCluster())
     self.assertEqual(documentsBefore, documentsAfter)
     self.assertEqual(original.aggregateVector, mergeObject.aggregateVector)
     self.assertEqual(original.vectorWeights, mergeObject.vectorWeights)
     self.assertEqual(original.lastStreamAddedTime, mergeObject.lastStreamAddedTime)
示例#23
0
 def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(
         self):
     """Merge cluster2 into the merge object built from cluster1; check documents, mean vector, ids and last-added time."""
     bothStreams = [self.stream1, self.stream2]
     merged = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
     merged.mergeCluster(self.cluster2)
     self.assertEqual(bothStreams, list(merged.iterateDocumentsInCluster()))
     expectedMean = Vector.getMeanVector([self.stream1, self.stream2])
     self.assertEqual(expectedMean, merged)
     # Every document is expected to carry the merged cluster's docId as its clusterId.
     expectedIds = [merged.docId, merged.docId]
     actualIds = [doc.clusterId for doc in merged.iterateDocumentsInCluster()]
     self.assertEqual(expectedIds, actualIds)
     # cluster2's time is the later one and is expected on the merged cluster.
     self.assertEqual(self.cluster2.lastStreamAddedTime,
                      merged.lastStreamAddedTime)
示例#24
0
 def clusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime):
     print currentMessageTime
     print '\n\n\nEntering:', currentMessageTime, len(
         hdStreamClusteringObject.phraseTextAndDimensionMap), len(
             hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(
                 hdStreamClusteringObject.clusters)
     for cluster, _ in sorted(StreamCluster.iterateByAttribute(
             hdStreamClusteringObject.clusters.values(), 'length'),
                              key=itemgetter(1),
                              reverse=True)[:1]:
         print cluster.clusterId, cluster.length, [
             stream.docId for stream in cluster.iterateDocumentsInCluster()
         ][:5], cluster.getTopDimensions(numberOfFeatures=5)
     print 'Leaving: ', currentMessageTime, len(
         hdStreamClusteringObject.phraseTextAndDimensionMap), len(
             hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(
                 hdStreamClusteringObject.clusters)
 def combineClusters(clusters, **twitter_stream_settings):
     """Greedily merge clusters with similar hashtag sets and return the merged clusters by id.

     Clusters are visited in ascending order of the integer in their id. Each cluster is merged
     into the first already-merged cluster for which the union of both hashtag sets is non-empty
     and jaccard_distance(...) <= 1 - cluster_merging_jaccard_distance_threshold; otherwise it
     starts a new merged cluster via StreamCluster.getClusterObjectToMergeFrom.

     clusters -- mapping of cluster id -> cluster; ids must contain an integer after the first
         underscore.
     twitter_stream_settings -- settings; 'cluster_merging_jaccard_distance_threshold' is read
         only when a comparison is actually made.
     Returns a dict mapping merged cluster id -> merged cluster, each carrying a
     `mergedClustersList` of the original cluster ids it absorbed.
     """
     def getHashtagSet(vector):
         # A dimension may contain several whitespace-separated words; keep the '#'-prefixed ones.
         return set(word for dimension in vector
                    for word in dimension.split() if word.startswith('#'))

     def getClusterInt(clusterId):
         # Sort key: the integer that follows the first underscore of the cluster id.
         return int(clusterId.split('_')[1])

     mergedClustersMap = {}
     orderedClusters = [clusters[clusterId] for clusterId in sorted(clusters, key=getClusterInt)]
     for cluster in orderedClusters:
         mergedClusterId = None
         # `cluster` is not modified during the scan below, so extract its hashtags once, and
         # only when there is at least one merged cluster to compare against.
         clusterHashtags = getHashtagSet(cluster) if mergedClustersMap else None
         # values() works on Python 2 and 3; the map is not modified inside this loop.
         for mergedCluster in mergedClustersMap.values():
             mergedClusterHashtags = getHashtagSet(mergedCluster)
             if (clusterHashtags.union(mergedClusterHashtags)
                     and jaccard_distance(clusterHashtags, mergedClusterHashtags)
                     <= 1 - twitter_stream_settings['cluster_merging_jaccard_distance_threshold']):
                 mergedCluster.mergeCluster(cluster)
                 mergedCluster.mergedClustersList.append(cluster.clusterId)
                 mergedClusterId = mergedCluster.clusterId
                 break
         if mergedClusterId is None:
             mergedCluster = StreamCluster.getClusterObjectToMergeFrom(cluster)
             mergedCluster.mergedClustersList = [cluster.clusterId]
             mergedClustersMap[mergedCluster.clusterId] = mergedCluster
     return mergedClustersMap
示例#26
0
 def getClusterFromMapFormat(clusterMap):
     """Rebuild a StreamCluster from its map (dict) representation.

     clusterMap -- mapping read for the keys 'clusterId', 'lastStreamAddedTime',
         'mergedClustersList', 'streams' and 'dimensions'.
     Returns the populated StreamCluster.
     """
     # StreamCluster is constructed from a stream, so seed it with a placeholder stream whose
     # vector is empty; the fields taken from clusterMap are assigned afterwards.
     placeholderMessage = Message(1, '', '', datetime.now())
     placeholderMessage.vector = Vector({})
     placeholderStream = Stream(1, placeholderMessage)
     cluster = StreamCluster(placeholderStream)
     cluster.clusterId = clusterMap['clusterId']
     cluster.lastStreamAddedTime = getDateTimeObjectFromTweetTimestamp(
         clusterMap['lastStreamAddedTime'])
     cluster.mergedClustersList = clusterMap['mergedClustersList']
     cluster.documentsInCluster = clusterMap['streams']
     # items() exists on both Python 2 and Python 3 dicts; iteritems() is Python 2 only.
     for dimension, value in clusterMap['dimensions'].items():
         cluster[dimension] = value
     return cluster
 def setUp(self):
     """Create one message with a vector, its stream, a cluster for that stream, and the crowd under test."""
     oneMinuteEarlier = test_time - timedelta(seconds=60)
     self.m1 = Message(1, 'sdf', 'A project to cluster high-dimensional streams.', oneMinuteEarlier)
     self.m1.vector = Vector({1: 1.0, 2: 3.0})
     self.stream = Stream(1, self.m1)
     self.cluster = StreamCluster(self.stream)
     # The crowd is created from the cluster at test_time.
     self.crowd = Crowd(self.cluster, test_time)
 def test_getClusterInMapFormat(self):
     """getClusterInMapFormat returns the expected dict for a merge object built from cluster1."""
     merged = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
     merged.mergedClustersList = [self.cluster1.clusterId]
     merged.lastStreamAddedTime = test_time
     expectedMap = {
         'clusterId': merged.clusterId,
         'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(merged.lastStreamAddedTime),
         'mergedClustersList': [self.cluster1.clusterId],
         'streams': [self.doc1.docId],
         'dimensions': {'#tcot': 2, 'dsf': 2},
     }
     actualMap = TwitterCrowdsSpecificMethods.getClusterInMapFormat(merged)
     self.assertEqual(expectedMap, actualMap)
 def test_mergeCluster_lastStreamAddedTime_lesser_than_original_cluster(self):
     """Merging the older cluster1 into a merge object built from cluster3 keeps cluster3's last-added time."""
     merged = StreamCluster.getClusterObjectToMergeFrom(self.cluster3)
     merged.mergeCluster(self.cluster1)
     olderTime = self.cluster1.lastStreamAddedTime
     newerTime = self.cluster3.lastStreamAddedTime
     self.assertTrue(olderTime < newerTime)
     self.assertEqual(newerTime, merged.lastStreamAddedTime)
     self.assertNotEqual(olderTime, merged.lastStreamAddedTime)
 def writeClusters(hdStreamClusteringObject, currentMessageTime):
     print '\n\n\nEntering:', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
     iterationData = {'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                      'clusters': map(TwitterCrowdsSpecificMethods.getClusterInMapFormat, [cluster for cluster, _ in sorted(StreamCluster.iterateByAttribute(hdStreamClusteringObject.clusters.values(), 'length'), key=itemgetter(1), reverse=True)]),
                      'settings': Settings.getSerialzedObject(hdStreamClusteringObject.stream_settings)
                      }
     FileIO.writeToFileAsJson(iterationData, hdStreamClusteringObject.stream_settings['lsh_clusters_folder']+FileIO.getFileByDay(currentMessageTime))
     print 'Leaving: ', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
示例#31
0
class StreamClusterTests(unittest.TestCase):
    """Tests for StreamCluster: initialisation, merge objects, merging, adding documents and threshold selection."""

    def setUp(self):
        """Create three messages one minute apart around test_time, their streams, and one cluster per stream."""
        messageText = 'A project to cluster high-dimensional streams.'
        oneMinute = timedelta(seconds=60)
        self.m1 = Message(1, 'sdf', messageText, test_time - oneMinute)
        self.m1.vector = Vector({1: 2, 2: 4})
        self.stream1 = Stream(1, self.m1)
        self.m2 = Message(2, 'sdf', messageText, test_time)
        self.m2.vector = Vector({2: 4})
        self.stream2 = Stream(2, self.m2)
        self.m3 = Message(3, 'sdf', messageText, test_time + oneMinute)
        self.m3.vector = Vector({2: 4})
        self.stream3 = Stream(3, self.m3)
        # Clusters are created in stream order: 1, 2, 3.
        self.cluster1 = StreamCluster(self.stream1)
        self.cluster2 = StreamCluster(self.stream2)
        self.cluster3 = StreamCluster(self.stream3)

    def test_initialization(self):
        """A new cluster's lastStreamAddedTime equals the time given to its message."""
        expectedFirst = test_time - timedelta(seconds=60)
        self.assertEqual(expectedFirst, self.cluster1.lastStreamAddedTime)
        self.assertEqual(test_time, self.cluster2.lastStreamAddedTime)

    def test_getClusterObjectToMergeFrom(self):
        """The merge object built from cluster1 equals it in content but has a different clusterId."""
        source = self.cluster1
        originalDocuments = list(source.iterateDocumentsInCluster())
        merged = StreamCluster.getClusterObjectToMergeFrom(source)
        self.assertEqual(test_time - timedelta(seconds=60), merged.lastStreamAddedTime)
        self.assertEqual(source, merged)
        self.assertNotEqual(source.clusterId, merged.clusterId)
        self.assertEqual(originalDocuments, list(merged.iterateDocumentsInCluster()))
        # These attributes are expected to be carried over unchanged.
        for attribute in ('aggregateVector', 'vectorWeights', 'lastStreamAddedTime'):
            self.assertEqual(getattr(source, attribute), getattr(merged, attribute))

    def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(
            self):
        """Merge cluster2 into the merge object built from cluster1; check documents, mean vector, ids and last-added time."""
        merged = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
        merged.mergeCluster(self.cluster2)
        mergedDocuments = list(merged.iterateDocumentsInCluster())
        self.assertEqual([self.stream1, self.stream2], mergedDocuments)
        expectedMean = Vector.getMeanVector([self.stream1, self.stream2])
        self.assertEqual(expectedMean, merged)
        # Every document is expected to carry the merged cluster's docId as its clusterId.
        documentClusterIds = [doc.clusterId for doc in merged.iterateDocumentsInCluster()]
        self.assertEqual([merged.docId, merged.docId], documentClusterIds)
        # cluster2's time is the later one and is expected on the merged cluster.
        self.assertEqual(self.cluster2.lastStreamAddedTime, merged.lastStreamAddedTime)

    def test_mergeCluster_lastStreamAddedTime_lesser_than_original_cluster(
            self):
        """Merging the older cluster1 into a merge object built from cluster3 keeps cluster3's last-added time."""
        older, newer = self.cluster1, self.cluster3
        merged = StreamCluster.getClusterObjectToMergeFrom(newer)
        merged.mergeCluster(older)
        self.assertTrue(older.lastStreamAddedTime < newer.lastStreamAddedTime)
        self.assertEqual(newer.lastStreamAddedTime, merged.lastStreamAddedTime)
        self.assertNotEqual(older.lastStreamAddedTime, merged.lastStreamAddedTime)

    def test_addDocument(self):
        """Adding documents updates last-added time, the document's clusterId, the mean, the aggregate and the weights."""
        messageText = 'A project to cluster high-dimensional streams.'
        firstMessage = Message(3, 'sdf', messageText, test_time)
        firstMessage.vector = Vector({3: 4})
        firstStream = Stream(3, firstMessage)
        secondMessage = Message(4, 'sdf', messageText, test_time)
        secondMessage.vector = Vector({2: 4})
        secondStream = Stream(4, secondMessage)
        target = self.cluster1
        self.assertNotEqual(test_time, target.lastStreamAddedTime)
        target.addDocument(firstStream)
        self.assertEqual(test_time, target.lastStreamAddedTime)
        # The added document receives the cluster's id.
        self.assertEqual(target.clusterId, firstStream.clusterId)
        # The cluster itself holds the mean of its documents.
        self.assertEqual({1: 2 / 2., 2: 2., 3: 2.}, target)
        # The aggregate holds the per-dimension sums.
        self.assertEqual({1: 2, 2: 4, 3: 4}, target.aggregateVector)
        # The document is stored under its docId.
        self.assertEqual(firstStream, target.documentsInCluster[firstStream.docId])
        target.addDocument(secondStream)
        self.assertEqual(3, target.vectorWeights)
        self.assertEqual({1: 2 / 3., 2: 8 / 3., 3: 4 / 3.}, target)
        self.assertEqual({1: 2, 2: 8, 3: 4}, target.aggregateVector)

    def test_clustersIteration(self):
        """Check which clusters getClustersByAttributeAndThreshold yields for two 'lastStreamAddedTime' thresholds."""
        candidates = [self.cluster1, self.cluster2, self.cluster3]
        # With test_time as the threshold only cluster1 is expected.
        selectedAtTestTime = list(StreamCluster.getClustersByAttributeAndThreshold(
            candidates, 'lastStreamAddedTime', test_time,
            StreamCluster.BELOW_THRESHOLD))
        self.assertEqual([self.cluster1], selectedAtTestTime)
        # With the threshold sixty seconds later cluster2 is expected as well.
        selectedLater = list(StreamCluster.getClustersByAttributeAndThreshold(
            candidates, 'lastStreamAddedTime',
            test_time + timedelta(seconds=60), StreamCluster.BELOW_THRESHOLD))
        self.assertEqual([self.cluster1, self.cluster2], selectedLater)
 def clusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime):
     print currentMessageTime
     print '\n\n\nEntering:', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
     for cluster, _ in sorted(StreamCluster.iterateByAttribute(hdStreamClusteringObject.clusters.values(), 'length'), key=itemgetter(1), reverse=True)[:1]:
         print cluster.clusterId, cluster.length, [stream.docId for stream in cluster.iterateDocumentsInCluster()][:5], cluster.getTopDimensions(numberOfFeatures=5)
     print 'Leaving: ', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
class StreamClusterTests(unittest.TestCase):
    """Tests for StreamCluster: construction, merging, adding documents and threshold filtering."""

    @staticmethod
    def _buildStream(docId, dimensions, timestamp):
        """Build a Message with the given vector and the Stream wrapping it.

        The vector is attached to the message before the Stream is created.
        Returns the (message, stream) pair.
        """
        message = Message(docId, 'sdf', 'A project to cluster high-dimensional streams.', timestamp)
        message.vector = Vector(dimensions)
        return message, Stream(docId, message)

    def setUp(self):
        """Create three streams one minute apart around test_time, and one cluster per stream."""
        oneMinute = timedelta(seconds=60)
        self.m1, self.stream1 = self._buildStream(1, {1: 2, 2: 4}, test_time - oneMinute)
        self.m2, self.stream2 = self._buildStream(2, {2: 4}, test_time)
        self.m3, self.stream3 = self._buildStream(3, {2: 4}, test_time + oneMinute)
        # Clusters are created only after all streams exist, in stream order.
        self.cluster1 = StreamCluster(self.stream1)
        self.cluster2 = StreamCluster(self.stream2)
        self.cluster3 = StreamCluster(self.stream3)

    def test_initialization(self):
        """A new cluster takes lastStreamAddedTime from the stream it was built from."""
        oneMinuteEarlier = test_time - timedelta(seconds=60)
        self.assertEqual(oneMinuteEarlier, self.cluster1.lastStreamAddedTime)
        self.assertEqual(test_time, self.cluster2.lastStreamAddedTime)

    def test_getClusterObjectToMergeFrom(self):
        """The merge-from object equals its source cluster in everything except clusterId."""
        source = self.cluster1
        documentsBefore = list(source.iterateDocumentsInCluster())
        mergedCluster = StreamCluster.getClusterObjectToMergeFrom(source)
        self.assertEqual(test_time - timedelta(seconds=60), mergedCluster.lastStreamAddedTime)
        self.assertEqual(source, mergedCluster)
        self.assertNotEqual(source.clusterId, mergedCluster.clusterId)
        documentsAfter = list(mergedCluster.iterateDocumentsInCluster())
        self.assertEqual(documentsBefore, documentsAfter)
        self.assertEqual(source.aggregateVector, mergedCluster.aggregateVector)
        self.assertEqual(source.vectorWeights, mergedCluster.vectorWeights)
        self.assertEqual(source.lastStreamAddedTime, mergedCluster.lastStreamAddedTime)

    def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(self):
        """Merging a newer cluster combines documents, mean vector and takes the newer time."""
        mergedCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
        mergedCluster.mergeCluster(self.cluster2)
        bothStreams = [self.stream1, self.stream2]
        self.assertEqual(bothStreams, list(mergedCluster.iterateDocumentsInCluster()))
        expectedMean = Vector.getMeanVector([self.stream1, self.stream2])
        self.assertEqual(expectedMean, mergedCluster)
        # NOTE(review): the expected ids are read from mergedCluster.docId, not
        # clusterId -- presumably both hold the same value for a cluster; confirm.
        expectedIds = [mergedCluster.docId, mergedCluster.docId]
        actualIds = [doc.clusterId for doc in mergedCluster.iterateDocumentsInCluster()]
        self.assertEqual(expectedIds, actualIds)
        self.assertEqual(self.cluster2.lastStreamAddedTime, mergedCluster.lastStreamAddedTime)

    def test_mergeCluster_lastStreamAddedTime_lesser_than_original_cluster(self):
        """Merging an older cluster must not move lastStreamAddedTime backwards."""
        older, newer = self.cluster1, self.cluster3
        mergedCluster = StreamCluster.getClusterObjectToMergeFrom(newer)
        mergedCluster.mergeCluster(older)
        self.assertTrue(older.lastStreamAddedTime < newer.lastStreamAddedTime)
        self.assertEqual(newer.lastStreamAddedTime, mergedCluster.lastStreamAddedTime)
        self.assertNotEqual(older.lastStreamAddedTime, mergedCluster.lastStreamAddedTime)

    def test_addDocument(self):
        """Adding documents updates time, cluster id, mean, aggregate, weights and membership."""
        _, stream1 = self._buildStream(3, {3: 4}, test_time)
        _, stream2 = self._buildStream(4, {2: 4}, test_time)
        target = self.cluster1

        self.assertNotEqual(test_time, target.lastStreamAddedTime)
        target.addDocument(stream1)
        self.assertEqual(test_time, target.lastStreamAddedTime)
        # The added stream is tagged with the cluster's id.
        self.assertEqual(target.clusterId, stream1.clusterId)
        # The cluster itself (its mean vector) is recomputed.
        self.assertEqual({1: 2 / 2., 2: 2., 3: 2.}, target)
        # The aggregate vector is updated.
        self.assertEqual({1: 2, 2: 4, 3: 4}, target.aggregateVector)
        # The stream is registered among the cluster's documents.
        self.assertEqual(stream1, target.documentsInCluster[stream1.docId])

        target.addDocument(stream2)
        self.assertEqual(3, target.vectorWeights)
        self.assertEqual({1: 2 / 3., 2: 8 / 3., 3: 4 / 3.}, target)
        self.assertEqual({1: 2, 2: 8, 3: 4}, target.aggregateVector)

    def test_clustersIteration(self):
        """BELOW_THRESHOLD filtering on 'lastStreamAddedTime' excludes clusters at the cut-off."""
        allClusters = [self.cluster1, self.cluster2, self.cluster3]

        belowTestTime = list(StreamCluster.getClustersByAttributeAndThreshold(
            allClusters, 'lastStreamAddedTime', test_time, StreamCluster.BELOW_THRESHOLD))
        self.assertEqual([self.cluster1], belowTestTime)

        laterCutoff = test_time + timedelta(seconds=60)
        belowLaterCutoff = list(StreamCluster.getClustersByAttributeAndThreshold(
            allClusters, 'lastStreamAddedTime', laterCutoff, StreamCluster.BELOW_THRESHOLD))
        self.assertEqual([self.cluster1, self.cluster2], belowLaterCutoff)