def clusterFilteringMethod(hdStreamClusteringObject, currentMessageTime):
    """Prune filtered and stale clusters, optionally combine survivors, reset tries.

    Mutates hdStreamClusteringObject.clusters in place (and may rebind it when a
    combineClustersMethod is configured), then resets the signature tries.
    """
    stream_settings = hdStreamClusteringObject.stream_settings
    # Pass 1: drop clusters below the configured filter-attribute threshold.
    # (Python 2 dict.values() returns a list snapshot, so deleting from the
    # dict while iterating the returned clusters is safe.)
    for cluster in StreamCluster.getClustersByAttributeAndThreshold(
            hdStreamClusteringObject.clusters.values(),
            stream_settings['cluster_filter_attribute'],
            stream_settings['cluster_filter_threshold'],
            StreamCluster.BELOW_THRESHOLD):
        del hdStreamClusteringObject.clusters[cluster.clusterId]
    # Pass 2: drop clusters that received no stream within the inactivity window.
    for cluster in StreamCluster.getClustersByAttributeAndThreshold(
            hdStreamClusteringObject.clusters.values(),
            'lastStreamAddedTime',
            currentMessageTime - stream_settings['cluster_inactivity_time_in_seconds'],
            StreamCluster.BELOW_THRESHOLD):
        del hdStreamClusteringObject.clusters[cluster.clusterId]
    # 'is not None' rather than '!= None': identity test is the correct idiom.
    if hdStreamClusteringObject.combineClustersMethod is not None:
        hdStreamClusteringObject.clusters = hdStreamClusteringObject.combineClustersMethod(
            hdStreamClusteringObject.clusters, **stream_settings)
    DataStreamMethods._resetClustersInSignatureTries(hdStreamClusteringObject, currentMessageTime)
def getClusterAndUpdateExistingClusters(self, stream):
    """Fold *stream* into its predicted cluster, or seed a brand-new cluster."""
    matchedClusterId = self.getClusterForDocument(stream)
    if matchedClusterId is not None:
        # Known cluster: just add the stream to it.
        self.clusters[matchedClusterId].addDocument(stream, **self.stream_settings)
        return
    # No match: build a cluster around this stream, compute its LSH signature,
    # and register it with every signature permutation.
    freshCluster = StreamCluster(stream)
    freshCluster.setSignatureUsingVectorPermutations(
        self.unitVector, self.vectorPermutations, self.phraseTextAndDimensionMap)
    for signaturePermutation in self.signaturePermutations:
        signaturePermutation.addDocument(freshCluster)
    self.clusters[freshCluster.clusterId] = freshCluster
def setUp(self):
    # Fixture: a single message stamped one minute before test_time, wrapped
    # in a stream, a cluster, and finally a crowd created at test_time.
    message = Message(1, 'sdf', 'A project to cluster high-dimensional streams.',
                      test_time - timedelta(seconds=60))
    message.vector = Vector({1: 1., 2: 3.})
    self.m1 = message
    self.stream = Stream(1, self.m1)
    self.cluster = StreamCluster(self.stream)
    self.crowd = Crowd(self.cluster, test_time)
def test_clustersIteration(self):
    """Filtering keeps only clusters whose lastStreamAddedTime is below the cutoff."""
    allClusters = [self.cluster1, self.cluster2, self.cluster3]

    def clustersBelow(cutoff):
        return list(StreamCluster.getClustersByAttributeAndThreshold(
            allClusters, 'lastStreamAddedTime', cutoff, StreamCluster.BELOW_THRESHOLD))

    self.assertEqual([self.cluster1], clustersBelow(test_time))
    self.assertEqual([self.cluster1, self.cluster2],
                     clustersBelow(test_time + timedelta(seconds=60)))
def getClusterFromMapFormat(clusterMap):
    """Rebuild a StreamCluster from its serialized map representation."""
    # Throwaway empty-vector stream: only needed to satisfy the constructor.
    placeholderMessage = Message(1, '', '', datetime.now())
    placeholderMessage.vector = Vector({})
    cluster = StreamCluster(Stream(1, placeholderMessage))
    cluster.clusterId = clusterMap['clusterId']
    cluster.lastStreamAddedTime = getDateTimeObjectFromTweetTimestamp(clusterMap['lastStreamAddedTime'])
    cluster.mergedClustersList = clusterMap['mergedClustersList']
    cluster.documentsInCluster = clusterMap['streams']
    # Restore dimension weights; the cluster itself behaves like a mapping.
    for dimension, weight in clusterMap['dimensions'].iteritems():
        cluster[dimension] = weight
    return cluster
def getClusterAndUpdateExistingClusters(self, stream):
    # Best matching existing cluster for this stream, or None.
    predictedCluster = self.getClusterForDocument(stream)
    ''' Do not remove this comment. Might need this if StreamCluster is used again in future.
    if predictedCluster!=None: self.clusters[predictedCluster].addStream(stream, **self.stream_settings)
    '''
    if predictedCluster!=None: self.clusters[predictedCluster].addDocument(stream, **self.stream_settings)
    else:
        # No match: seed a new cluster with this stream, sign it via the vector
        # permutations, and register it with each signature permutation.
        newCluster = StreamCluster(stream)
        newCluster.setSignatureUsingVectorPermutations(self.unitVector, self.vectorPermutations, self.phraseTextAndDimensionMap)
        for permutation in self.signaturePermutations: permutation.addDocument(newCluster)
        self.clusters[newCluster.clusterId] = newCluster
class CrowdTests(unittest.TestCase):
    """Tests for Crowd: a time-indexed collection of StreamClusters."""

    def setUp(self):
        # One message a minute before test_time; its stream seeds the cluster
        # that the crowd is created from.
        self.m1 = Message(1, 'sdf', 'A project to cluster high-dimensional streams.', test_time - timedelta(seconds=60))
        self.m1.vector = Vector({1: 1., 2: 3.})
        self.stream = Stream(1, self.m1)
        self.cluster = StreamCluster(self.stream)
        self.crowd = Crowd(self.cluster, test_time)

    def test_intitialization(self):
        # The crowd takes its id from the seed cluster.
        self.assertEqual(self.cluster.clusterId, self.crowd.crowdId)

    def test_append(self):
        self.crowd.append(self.cluster, test_time + timedelta(days=1))
        # Clusters are keyed by the epoch of the time they were appended at.
        self.assertEqual([
            GeneralMethods.getEpochFromDateTimeObject(test_time),
            GeneralMethods.getEpochFromDateTimeObject(test_time + timedelta(days=1))
        ], sorted(self.crowd.clusters.keys()))
        self.assertEqual(
            StreamCluster,
            type(self.crowd.clusters[GeneralMethods.getEpochFromDateTimeObject(test_time)]))
        # Two distinct append times -> lifespan of 2.
        self.assertEqual(2, self.crowd.lifespan)
        self.assertEqual(
            getStringRepresentationForTweetTimestamp(test_time),
            getStringRepresentationForTweetTimestamp(self.crowd.startTime))
        self.assertEqual(
            getStringRepresentationForTweetTimestamp(test_time + timedelta(days=1)),
            getStringRepresentationForTweetTimestamp(self.crowd.endTime))

    def test_maxClusterSize(self):
        self.assertEqual(1, self.crowd.maxClusterSize)
        message2 = Message(4, 'sdf', 'A project to cluster high-dimensional streams.', test_time)
        message2.vector = Vector({2: 4})
        stream2 = Stream(4, message2)
        self.cluster.addDocument(stream2)
        # Growing the underlying cluster grows the crowd's max cluster size.
        self.assertEqual(2, self.crowd.maxClusterSize)

    def test_crowdSize(self):
        self.assertEqual(1, self.crowd.crowdSize)
        self.cluster.addDocument(Stream(2, self.m1))
        self.cluster.addDocument(Stream(3, self.m1))
        self.assertEqual(3, self.crowd.crowdSize)
        cluster = StreamCluster(Stream(3, self.m1))
        self.crowd.append(cluster, test_time + timedelta(days=2))
        # crowdSize follows the largest cluster, not the sum over clusters.
        self.assertNotEqual(4, self.crowd.crowdSize)
        self.assertEqual(3, self.crowd.crowdSize)
def getClusterAndUpdateExistingClusters(self, stream):
    """Route *stream* into its predicted cluster or create a new one for it.

    Side effects: mutates self.clusters; a newly created cluster is also
    registered with every signature permutation so later lookups can find it.
    """
    predictedCluster = self.getClusterForDocument(stream)
    # 'is not None' rather than '!= None': identity test is the correct idiom.
    if predictedCluster is not None:
        self.clusters[predictedCluster].addDocument(stream, **self.stream_settings)
    else:
        newCluster = StreamCluster(stream)
        newCluster.setSignatureUsingVectorPermutations(
            self.unitVector, self.vectorPermutations, self.phraseTextAndDimensionMap)
        for permutation in self.signaturePermutations:
            permutation.addDocument(newCluster)
        self.clusters[newCluster.clusterId] = newCluster
def test_clustersIteration(self):
    """getClustersByAttributeAndThreshold keeps clusters strictly below the cutoff."""
    candidates = [self.cluster1, self.cluster2, self.cluster3]
    earliestOnly = list(StreamCluster.getClustersByAttributeAndThreshold(
        candidates, 'lastStreamAddedTime', test_time,
        StreamCluster.BELOW_THRESHOLD))
    self.assertEqual([self.cluster1], earliestOnly)
    firstTwo = list(StreamCluster.getClustersByAttributeAndThreshold(
        candidates, 'lastStreamAddedTime', test_time + timedelta(seconds=60),
        StreamCluster.BELOW_THRESHOLD))
    self.assertEqual([self.cluster1, self.cluster2], firstTwo)
def combineClusters(clusters, **twitter_stream_settings):
    """Greedily merge clusters whose hashtag sets are close in Jaccard distance.

    Clusters are visited in ascending numeric-id order; each is folded into the
    first already-merged cluster within the threshold, otherwise it starts a new
    merged cluster. Returns {mergedClusterId: mergedCluster}.
    """
    def getHashtagSet(vector):
        # Hashtag tokens across all dimensions of a vector-like object.
        return set([word for dimension in vector for word in dimension.split()
                    if word.startswith('#')])

    def getClusterInt(clusterId):
        # Numeric suffix of a 'prefix_<n>' cluster id; avoids shadowing builtin id.
        return int(clusterId.split('_')[1])

    # Hoisted loop invariant: similarity cutoff on jaccard distance.
    jaccardCutoff = 1 - twitter_stream_settings['cluster_merging_jaccard_distance_threshold']
    mergedClustersMap = {}
    for cluster in [clusters[cid] for cid in sorted(clusters, key=getClusterInt)]:
        mergedClusterId = None
        for mergedCluster in mergedClustersMap.itervalues():
            clusterHashtags = getHashtagSet(cluster)
            mergedClusterHashtags = getHashtagSet(mergedCluster)
            # Merge only when at least one hashtag exists and the sets are close.
            if len(clusterHashtags.union(mergedClusterHashtags)) and \
                    jaccard_distance(clusterHashtags, mergedClusterHashtags) <= jaccardCutoff:
                # Was a side-effecting tuple expression; split into statements.
                mergedCluster.mergeCluster(cluster)
                mergedCluster.mergedClustersList.append(cluster.clusterId)
                mergedClusterId = mergedCluster.clusterId
                break
        if mergedClusterId is None:
            mergedCluster = StreamCluster.getClusterObjectToMergeFrom(cluster)
            mergedCluster.mergedClustersList = [cluster.clusterId]
            mergedClustersMap[mergedCluster.clusterId] = mergedCluster
    return mergedClustersMap
def getClusterAndUpdateExistingClusters(self, stream):
    # Best matching existing cluster for this stream, or None.
    predictedCluster = self.getClusterForDocument(stream)
    ''' Do not remove this comment. Might need this if StreamCluster is used again in future.
    if predictedCluster!=None: self.clusters[predictedCluster].addStream(stream, **self.stream_settings)
    '''
    if predictedCluster != None:
        self.clusters[predictedCluster].addDocument(
            stream, **self.stream_settings)
    else:
        # No match: seed a new cluster with this stream, sign it via the vector
        # permutations, and register it with each signature permutation.
        newCluster = StreamCluster(stream)
        newCluster.setSignatureUsingVectorPermutations(
            self.unitVector, self.vectorPermutations,
            self.phraseTextAndDimensionMap)
        for permutation in self.signaturePermutations:
            permutation.addDocument(newCluster)
        self.clusters[newCluster.clusterId] = newCluster
def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(self):
    """Merging a newer cluster pulls in its docs, updates the mean, advances the time."""
    merged = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
    merged.mergeCluster(self.cluster2)
    self.assertEqual([self.stream1, self.stream2],
                     list(merged.iterateDocumentsInCluster()))
    self.assertEqual(Vector.getMeanVector([self.stream1, self.stream2]), merged)
    # Every merged document gets re-labelled with the merged cluster's id.
    relabelled = [doc.clusterId for doc in merged.iterateDocumentsInCluster()]
    self.assertEqual([merged.docId, merged.docId], relabelled)
    self.assertEqual(self.cluster2.lastStreamAddedTime, merged.lastStreamAddedTime)
def setUp(self):
    # Raw tweet JSON consumed by convertTweetJSONToMessage.
    self.tweet = {'user':{'screen_name': 'abc'}, 'id':10, 'text':'A project to cluster high-dimensional streams.', 'created_at': 'Tue Mar 01 05:59:59 +0000 2011'}
    # Two clusters sharing the '#tcot' hashtag, plus two extra documents.
    m1 = Message(1, '', '', datetime.now())
    m1.vector = Vector({'#tcot':2,'dsf':4})
    self.cluster1 = StreamCluster(Stream(1, m1))
    m2 = Message(2, '', '', datetime.now())
    m2.vector = Vector({'#tcot':4})
    self.cluster2 = StreamCluster(Stream(2, m2))
    m3 = Message(3, '', '', datetime.now())
    m3.vector = Vector(Vector({'#tcot':2}))
    m4 = Message(4, '', '', datetime.now())
    m4.vector = Vector(Vector({'#tcot':2}))
    self.doc1 = Stream(1, m3)
    self.doc2 = Stream(2, m4)
    # Expected mean is computed BEFORE the docs are folded into the clusters.
    self.meanVectorForAllDocuments = Vector.getMeanVector([self.cluster1, self.cluster2, self.doc1, self.doc2])
    self.cluster1.addDocument(self.doc1)
    self.cluster2.addDocument(self.doc2)
def test_crowdSize(self):
    """crowdSize tracks the largest cluster, not the total across appended clusters."""
    self.assertEqual(1, self.crowd.crowdSize)
    for streamId in (2, 3):
        self.cluster.addDocument(Stream(streamId, self.m1))
    self.assertEqual(3, self.crowd.crowdSize)
    laterCluster = StreamCluster(Stream(3, self.m1))
    self.crowd.append(laterCluster, test_time + timedelta(days=2))
    # Appending a single-stream cluster does not raise the crowd size.
    self.assertNotEqual(4, self.crowd.crowdSize)
    self.assertEqual(3, self.crowd.crowdSize)
class TwitterCrowdsSpecificMethodsTests(unittest.TestCase):
    """Tests for tweet conversion, cluster combining, and cluster (de)serialization."""

    def setUp(self):
        # Raw tweet JSON consumed by convertTweetJSONToMessage.
        self.tweet = {'user':{'screen_name': 'abc'}, 'id':10, 'text':'A project to cluster high-dimensional streams.', 'created_at': 'Tue Mar 01 05:59:59 +0000 2011'}
        # Two clusters sharing the '#tcot' hashtag, plus two extra documents.
        m1 = Message(1, '', '', datetime.now())
        m1.vector = Vector({'#tcot':2,'dsf':4})
        self.cluster1 = StreamCluster(Stream(1, m1))
        m2 = Message(2, '', '', datetime.now())
        m2.vector = Vector({'#tcot':4})
        self.cluster2 = StreamCluster(Stream(2, m2))
        m3 = Message(3, '', '', datetime.now())
        m3.vector = Vector(Vector({'#tcot':2}))
        m4 = Message(4, '', '', datetime.now())
        m4.vector = Vector(Vector({'#tcot':2}))
        self.doc1 = Stream(1, m3)
        self.doc2 = Stream(2, m4)
        # Expected mean is computed BEFORE the docs are folded into the clusters.
        self.meanVectorForAllDocuments = Vector.getMeanVector([self.cluster1, self.cluster2, self.doc1, self.doc2])
        self.cluster1.addDocument(self.doc1)
        self.cluster2.addDocument(self.doc2)

    def test_convertTweetJSONToMessage(self):
        message = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(self.tweet, **twitter_stream_settings)
        # Tweet text is tokenized into unit-count dimensions.
        self.assertEqual({'project': 1, 'cluster': 1, 'streams': 1, 'highdimensional': 1}, message.vector)

    def test_combineClusters(self):
        clustersMap = {self.cluster1.clusterId: self.cluster1, self.cluster2.clusterId: self.cluster2}
        clustersMap = TwitterCrowdsSpecificMethods.combineClusters(clustersMap, **twitter_stream_settings)
        # Both clusters share '#tcot', so they collapse into a single merged cluster.
        self.assertEqual(1, len(clustersMap))
        mergedCluster = clustersMap.values()[0]
        self.assertEqual([self.doc1, self.doc2], list(mergedCluster.iterateDocumentsInCluster()))
        self.assertEqual(self.meanVectorForAllDocuments, mergedCluster)
        # Merged documents are re-labelled with the merged cluster's id.
        self.assertEqual([mergedCluster.docId, mergedCluster.docId], list(doc.clusterId for doc in mergedCluster.iterateDocumentsInCluster()))
        self.assertEqual([self.cluster1.clusterId, self.cluster2.clusterId], mergedCluster.mergedClustersList)

    def test_getClusterInMapFormat(self):
        mergedCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
        mergedCluster.mergedClustersList = [self.cluster1.clusterId]
        mergedCluster.lastStreamAddedTime = test_time
        mapReresentation = {'clusterId': mergedCluster.clusterId, 'lastStreamAddedTime':getStringRepresentationForTweetTimestamp(mergedCluster.lastStreamAddedTime), 'mergedClustersList': [self.cluster1.clusterId], 'streams': [self.doc1.docId], 'dimensions': {'#tcot':2, 'dsf':2}}
        self.assertEqual(mapReresentation, TwitterCrowdsSpecificMethods.getClusterInMapFormat(mergedCluster))

    def test_getClusterFromMapFormat(self):
        mapReresentation = {'clusterId': 1, 'mergedClustersList': [self.cluster1.clusterId], 'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(test_time), 'streams': [self.doc1.docId], 'dimensions': {'#tcot':2, 'dsf':2}}
        cluster = TwitterCrowdsSpecificMethods.getClusterFromMapFormat(mapReresentation)
        self.assertEqual(1, cluster.clusterId)
        self.assertEqual([self.cluster1.clusterId], cluster.mergedClustersList)
        self.assertEqual([self.doc1.docId], cluster.documentsInCluster)
        # The cluster itself behaves like a mapping of dimension -> weight.
        self.assertEqual({'#tcot':2, 'dsf':2}, cluster)
        self.assertEqual(getStringRepresentationForTweetTimestamp(test_time), getStringRepresentationForTweetTimestamp(cluster.lastStreamAddedTime))
def test_getClusterObjectToMergeFrom(self):
    """The merge target copies the source cluster's state but gets a fresh id."""
    originalDocs = list(self.cluster1.iterateDocumentsInCluster())
    merged = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
    self.assertEqual(test_time - timedelta(seconds=60), merged.lastStreamAddedTime)
    self.assertEqual(self.cluster1, merged)
    # Contents match but the identity is new.
    self.assertNotEqual(self.cluster1.clusterId, merged.clusterId)
    self.assertEqual(originalDocs, list(merged.iterateDocumentsInCluster()))
    for attribute in ('aggregateVector', 'vectorWeights', 'lastStreamAddedTime'):
        self.assertEqual(getattr(self.cluster1, attribute), getattr(merged, attribute))
def setUp(self):
    # Three streams stamped at t-60s, t, and t+60s, each wrapped in a cluster.
    fixtures = [
        (1, Vector({1: 2, 2: 4}), test_time - timedelta(seconds=60)),
        (2, Vector({2: 4}), test_time),
        (3, Vector({2: 4}), test_time + timedelta(seconds=60)),
    ]
    for number, vector, stamp in fixtures:
        message = Message(number, 'sdf', 'A project to cluster high-dimensional streams.', stamp)
        message.vector = vector
        stream = Stream(number, message)
        setattr(self, 'm%d' % number, message)
        setattr(self, 'stream%d' % number, stream)
        setattr(self, 'cluster%d' % number, StreamCluster(stream))
def test_mergeCluster_lastStreamAddedTime_lesser_than_original_cluster(self):
    """Merging an older cluster must not regress the merged cluster's timestamp."""
    merged = StreamCluster.getClusterObjectToMergeFrom(self.cluster3)
    merged.mergeCluster(self.cluster1)
    self.assertTrue(
        self.cluster1.lastStreamAddedTime < self.cluster3.lastStreamAddedTime)
    # The newer (cluster3) timestamp survives the merge.
    self.assertEqual(self.cluster3.lastStreamAddedTime, merged.lastStreamAddedTime)
    self.assertNotEqual(self.cluster1.lastStreamAddedTime, merged.lastStreamAddedTime)
class CrowdTests(unittest.TestCase):
    """Tests for Crowd behaviour: id, append, lifespan, and size accounting."""

    def setUp(self):
        # One message a minute before test_time seeds the cluster and crowd.
        self.m1 = Message(1, 'sdf', 'A project to cluster high-dimensional streams.', test_time-timedelta(seconds=60))
        self.m1.vector = Vector({1:1.,2:3.})
        self.stream = Stream(1, self.m1)
        self.cluster = StreamCluster(self.stream)
        self.crowd = Crowd(self.cluster, test_time)

    def test_intitialization(self):
        # The crowd takes its id from the seed cluster.
        self.assertEqual(self.cluster.clusterId, self.crowd.crowdId)

    def test_append(self):
        self.crowd.append(self.cluster, test_time+timedelta(days=1))
        # Clusters are keyed by the epoch of their append time.
        self.assertEqual([GeneralMethods.getEpochFromDateTimeObject(test_time), GeneralMethods.getEpochFromDateTimeObject(test_time+timedelta(days=1))], sorted(self.crowd.clusters.keys()))
        self.assertEqual(StreamCluster, type(self.crowd.clusters[GeneralMethods.getEpochFromDateTimeObject(test_time)]))
        # Two distinct append times -> lifespan of 2.
        self.assertEqual(2, self.crowd.lifespan)
        self.assertEqual(getStringRepresentationForTweetTimestamp(test_time), getStringRepresentationForTweetTimestamp(self.crowd.startTime))
        self.assertEqual(getStringRepresentationForTweetTimestamp(test_time+timedelta(days=1)), getStringRepresentationForTweetTimestamp(self.crowd.endTime))

    def test_maxClusterSize(self):
        self.assertEqual(1, self.crowd.maxClusterSize)
        message2 = Message(4, 'sdf', 'A project to cluster high-dimensional streams.', test_time)
        message2.vector = Vector({2:4})
        stream2 = Stream(4, message2)
        self.cluster.addDocument(stream2)
        # Growing the underlying cluster grows the crowd's max cluster size.
        self.assertEqual(2, self.crowd.maxClusterSize)

    def test_crowdSize(self):
        self.assertEqual(1, self.crowd.crowdSize)
        self.cluster.addDocument(Stream(2, self.m1))
        self.cluster.addDocument(Stream(3, self.m1))
        self.assertEqual(3, self.crowd.crowdSize)
        cluster = StreamCluster(Stream(3, self.m1))
        self.crowd.append(cluster, test_time+timedelta(days=2))
        # crowdSize follows the largest cluster, not the sum over clusters.
        self.assertNotEqual(4, self.crowd.crowdSize)
        self.assertEqual(3, self.crowd.crowdSize)
def clusterFilteringMethod(hdStreamClusteringObject, currentMessageTime):
    """Drop filtered and inactive clusters, optionally combine the rest, reset tries.

    Mutates hdStreamClusteringObject.clusters in place (and may rebind it when
    combineClustersMethod is set), then resets the signature tries.
    """
    settings = hdStreamClusteringObject.stream_settings
    # Pass 1: remove clusters below the configured filter-attribute threshold.
    # (Python 2 dict.values() yields a list snapshot, so deletion while
    # iterating the filtered clusters is safe.)
    for cluster in StreamCluster.getClustersByAttributeAndThreshold(
            hdStreamClusteringObject.clusters.values(),
            settings['cluster_filter_attribute'],
            settings['cluster_filter_threshold'],
            StreamCluster.BELOW_THRESHOLD):
        del hdStreamClusteringObject.clusters[cluster.clusterId]
    # Pass 2: remove clusters idle longer than the inactivity window.
    for cluster in StreamCluster.getClustersByAttributeAndThreshold(
            hdStreamClusteringObject.clusters.values(),
            'lastStreamAddedTime',
            currentMessageTime - settings['cluster_inactivity_time_in_seconds'],
            StreamCluster.BELOW_THRESHOLD):
        del hdStreamClusteringObject.clusters[cluster.clusterId]
    # 'is not None' rather than '!= None': identity test is the correct idiom.
    if hdStreamClusteringObject.combineClustersMethod is not None:
        hdStreamClusteringObject.clusters = hdStreamClusteringObject.combineClustersMethod(
            hdStreamClusteringObject.clusters, **settings)
    DataStreamMethods._resetClustersInSignatureTries(
        hdStreamClusteringObject, currentMessageTime)
def setUp(self):
    # Three single-stream fixtures at t-60s, t, and t+60s.
    def buildMessageAndStream(streamId, vector, timestamp):
        message = Message(streamId, 'sdf', 'A project to cluster high-dimensional streams.', timestamp)
        message.vector = vector
        return message, Stream(streamId, message)

    self.m1, self.stream1 = buildMessageAndStream(1, Vector({1: 2, 2: 4}), test_time - timedelta(seconds=60))
    self.m2, self.stream2 = buildMessageAndStream(2, Vector({2: 4}), test_time)
    self.m3, self.stream3 = buildMessageAndStream(3, Vector({2: 4}), test_time + timedelta(seconds=60))
    self.cluster1 = StreamCluster(self.stream1)
    self.cluster2 = StreamCluster(self.stream2)
    self.cluster3 = StreamCluster(self.stream3)
def test_getClusterObjectToMergeFrom(self):
    """A merge-from copy preserves documents, vectors and time, but not the id."""
    docsBefore = list(self.cluster1.iterateDocumentsInCluster())
    copy = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
    self.assertEqual(test_time - timedelta(seconds=60), copy.lastStreamAddedTime)
    self.assertEqual(self.cluster1, copy)
    # Fresh id distinguishes the copy from its source.
    self.assertNotEqual(self.cluster1.clusterId, copy.clusterId)
    self.assertEqual(docsBefore, list(copy.iterateDocumentsInCluster()))
    self.assertEqual(self.cluster1.aggregateVector, copy.aggregateVector)
    self.assertEqual(self.cluster1.vectorWeights, copy.vectorWeights)
    self.assertEqual(self.cluster1.lastStreamAddedTime, copy.lastStreamAddedTime)
def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(self):
    """Merging absorbs documents, recomputes the mean and adopts the newer time."""
    target = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
    target.mergeCluster(self.cluster2)
    self.assertEqual([self.stream1, self.stream2],
                     list(target.iterateDocumentsInCluster()))
    expectedMean = Vector.getMeanVector([self.stream1, self.stream2])
    self.assertEqual(expectedMean, target)
    # All absorbed documents carry the merged cluster's id.
    self.assertEqual([target.docId, target.docId],
                     [doc.clusterId for doc in target.iterateDocumentsInCluster()])
    self.assertEqual(self.cluster2.lastStreamAddedTime, target.lastStreamAddedTime)
def clusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime):
    # Debug/analysis hook: prints map and cluster counts around each iteration
    # and dumps the single largest cluster's sample documents and top features.
    print currentMessageTime
    print '\n\n\nEntering:', currentMessageTime, len(
        hdStreamClusteringObject.phraseTextAndDimensionMap), len(
        hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(
        hdStreamClusteringObject.clusters)
    # Top-1 cluster by 'length' after sorting in descending order.
    for cluster, _ in sorted(StreamCluster.iterateByAttribute(
            hdStreamClusteringObject.clusters.values(), 'length'),
            key=itemgetter(1), reverse=True)[:1]:
        # First five document ids plus the five strongest dimensions.
        print cluster.clusterId, cluster.length, [
            stream.docId for stream in cluster.iterateDocumentsInCluster()
        ][:5], cluster.getTopDimensions(numberOfFeatures=5)
    print 'Leaving: ', currentMessageTime, len(
        hdStreamClusteringObject.phraseTextAndDimensionMap), len(
        hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(
        hdStreamClusteringObject.clusters)
def combineClusters(clusters, **twitter_stream_settings):
    """Greedily merge clusters whose hashtag sets fall within the Jaccard threshold.

    Visits clusters in ascending numeric-id order; each cluster is folded into
    the first merged cluster close enough in hashtag Jaccard distance, or else
    starts its own merged cluster. Returns {mergedClusterId: mergedCluster}.
    """
    def getHashtagSet(vector):
        # All hashtag tokens across the dimensions of a vector-like object.
        return set([word for dimension in vector for word in dimension.split()
                    if word.startswith('#')])

    def getClusterInt(clusterId):
        # Numeric suffix of a 'prefix_<n>' id; renamed to avoid shadowing builtin id.
        return int(clusterId.split('_')[1])

    # Hoisted loop invariant: the similarity cutoff on jaccard distance.
    jaccardCutoff = 1 - twitter_stream_settings['cluster_merging_jaccard_distance_threshold']
    mergedClustersMap = {}
    for cluster in [clusters[cid] for cid in sorted(clusters, key=getClusterInt)]:
        mergedClusterId = None
        for mergedCluster in mergedClustersMap.itervalues():
            clusterHashtags = getHashtagSet(cluster)
            mergedClusterHashtags = getHashtagSet(mergedCluster)
            # Merge only when at least one hashtag exists and the sets are close.
            if len(clusterHashtags.union(mergedClusterHashtags)) and \
                    jaccard_distance(clusterHashtags, mergedClusterHashtags) <= jaccardCutoff:
                # Formerly one side-effecting tuple expression; now two statements.
                mergedCluster.mergeCluster(cluster)
                mergedCluster.mergedClustersList.append(cluster.clusterId)
                mergedClusterId = mergedCluster.clusterId
                break
        if mergedClusterId is None:
            mergedCluster = StreamCluster.getClusterObjectToMergeFrom(cluster)
            mergedCluster.mergedClustersList = [cluster.clusterId]
            mergedClustersMap[mergedCluster.clusterId] = mergedCluster
    return mergedClustersMap
def getClusterFromMapFormat(clusterMap):
    """Inverse of getClusterInMapFormat: rebuild a StreamCluster from a plain dict."""
    # A throwaway empty-vector stream exists only to satisfy the constructor.
    seedMessage = Message(1, '', '', datetime.now())
    seedMessage.vector = Vector({})
    rebuilt = StreamCluster(Stream(1, seedMessage))
    rebuilt.clusterId = clusterMap['clusterId']
    rebuilt.lastStreamAddedTime = getDateTimeObjectFromTweetTimestamp(
        clusterMap['lastStreamAddedTime'])
    rebuilt.mergedClustersList = clusterMap['mergedClustersList']
    rebuilt.documentsInCluster = clusterMap['streams']
    # Restore dimension weights; the cluster behaves like a mapping.
    for dimension, weight in clusterMap['dimensions'].iteritems():
        rebuilt[dimension] = weight
    return rebuilt
def setUp(self):
    # Fixture: one minute-old message wrapped in a stream, cluster and crowd.
    self.m1 = Message(1, 'sdf', 'A project to cluster high-dimensional streams.',
                      test_time - timedelta(seconds=60))
    self.m1.vector = Vector({1: 1., 2: 3.})
    self.stream = Stream(1, self.m1)
    self.cluster = StreamCluster(self.stream)
    self.crowd = Crowd(self.cluster, test_time)
def test_getClusterInMapFormat(self):
    """Serialized map carries id, timestamp, merged ids, stream ids and dimensions."""
    mergedCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
    mergedCluster.mergedClustersList = [self.cluster1.clusterId]
    mergedCluster.lastStreamAddedTime = test_time
    expectedMap = {
        'clusterId': mergedCluster.clusterId,
        'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(mergedCluster.lastStreamAddedTime),
        'mergedClustersList': [self.cluster1.clusterId],
        'streams': [self.doc1.docId],
        'dimensions': {'#tcot':2, 'dsf':2},
    }
    self.assertEqual(expectedMap, TwitterCrowdsSpecificMethods.getClusterInMapFormat(mergedCluster))
def test_mergeCluster_lastStreamAddedTime_lesser_than_original_cluster(self):
    """An older incoming cluster must not roll back the merged timestamp."""
    target = StreamCluster.getClusterObjectToMergeFrom(self.cluster3)
    target.mergeCluster(self.cluster1)
    self.assertTrue(self.cluster1.lastStreamAddedTime < self.cluster3.lastStreamAddedTime)
    # The newer (cluster3) timestamp is retained after the merge.
    self.assertEqual(self.cluster3.lastStreamAddedTime, target.lastStreamAddedTime)
    self.assertNotEqual(self.cluster1.lastStreamAddedTime, target.lastStreamAddedTime)
def writeClusters(hdStreamClusteringObject, currentMessageTime):
    # Serialize all current clusters (largest first, by 'length') together with
    # the stream settings into the per-day JSON file; prints size diagnostics
    # before and after.
    print '\n\n\nEntering:', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
    iterationData = {
        'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
        # Clusters sorted by descending length, each converted to its map form.
        'clusters': map(TwitterCrowdsSpecificMethods.getClusterInMapFormat, [cluster for cluster, _ in sorted(StreamCluster.iterateByAttribute(hdStreamClusteringObject.clusters.values(), 'length'), key=itemgetter(1), reverse=True)]),
        'settings': Settings.getSerialzedObject(hdStreamClusteringObject.stream_settings)
    }
    # Output path: lsh_clusters_folder + file named after the current day.
    FileIO.writeToFileAsJson(iterationData, hdStreamClusteringObject.stream_settings['lsh_clusters_folder']+FileIO.getFileByDay(currentMessageTime))
    print 'Leaving: ', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
class StreamClusterTests(unittest.TestCase):
    """Tests for StreamCluster: construction, merge-copies, merging, adding docs, filtering."""

    def setUp(self):
        # Three single-stream clusters stamped at t-60s, t and t+60s.
        self.m1 = Message(1, 'sdf', 'A project to cluster high-dimensional streams.', test_time - timedelta(seconds=60))
        self.m1.vector = Vector({1: 2, 2: 4})
        self.stream1 = Stream(1, self.m1)
        self.m2 = Message(2, 'sdf', 'A project to cluster high-dimensional streams.', test_time)
        self.m2.vector = Vector({2: 4})
        self.stream2 = Stream(2, self.m2)
        self.m3 = Message(3, 'sdf', 'A project to cluster high-dimensional streams.', test_time + timedelta(seconds=60))
        self.m3.vector = Vector({2: 4})
        self.stream3 = Stream(3, self.m3)
        self.cluster1 = StreamCluster(self.stream1)
        self.cluster2 = StreamCluster(self.stream2)
        self.cluster3 = StreamCluster(self.stream3)

    def test_initialization(self):
        # lastStreamAddedTime starts at the seed stream's timestamp.
        self.assertEqual(test_time - timedelta(seconds=60), self.cluster1.lastStreamAddedTime)
        self.assertEqual(test_time, self.cluster2.lastStreamAddedTime)

    def test_getClusterObjectToMergeFrom(self):
        documentsInCluster = list(self.cluster1.iterateDocumentsInCluster())
        mergedCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
        self.assertEqual(test_time - timedelta(seconds=60), mergedCluster.lastStreamAddedTime)
        self.assertEqual(self.cluster1, mergedCluster)
        # Contents match the source but the copy gets a fresh id.
        self.assertNotEqual(self.cluster1.clusterId, mergedCluster.clusterId)
        self.assertEqual(documentsInCluster, list(mergedCluster.iterateDocumentsInCluster()))
        self.assertEqual(self.cluster1.aggregateVector, mergedCluster.aggregateVector)
        self.assertEqual(self.cluster1.vectorWeights, mergedCluster.vectorWeights)
        self.assertEqual(self.cluster1.lastStreamAddedTime, mergedCluster.lastStreamAddedTime)

    def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(self):
        mergedCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
        mergedCluster.mergeCluster(self.cluster2)
        self.assertEqual([self.stream1, self.stream2], list(mergedCluster.iterateDocumentsInCluster()))
        meanVectorForAllDocuments = Vector.getMeanVector([self.stream1, self.stream2])
        self.assertEqual(meanVectorForAllDocuments, mergedCluster)
        # Merged documents get re-labelled with the merged cluster's id.
        self.assertEqual([mergedCluster.docId, mergedCluster.docId], list(doc.clusterId for doc in mergedCluster.iterateDocumentsInCluster()))
        # The newer (cluster2) timestamp wins.
        self.assertEqual(self.cluster2.lastStreamAddedTime, mergedCluster.lastStreamAddedTime)

    def test_mergeCluster_lastStreamAddedTime_lesser_than_original_cluster(self):
        mergedCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster3)
        mergedCluster.mergeCluster(self.cluster1)
        self.assertTrue(self.cluster1.lastStreamAddedTime < self.cluster3.lastStreamAddedTime)
        # An older incoming timestamp does not regress the merged cluster's time.
        self.assertEqual(self.cluster3.lastStreamAddedTime, mergedCluster.lastStreamAddedTime)
        self.assertNotEqual(self.cluster1.lastStreamAddedTime, mergedCluster.lastStreamAddedTime)

    def test_addDocument(self):
        message1 = Message(3, 'sdf', 'A project to cluster high-dimensional streams.', test_time)
        message1.vector = Vector({3: 4})
        stream1 = Stream(3, message1)
        message2 = Message(4, 'sdf', 'A project to cluster high-dimensional streams.', test_time)
        message2.vector = Vector({2: 4})
        stream2 = Stream(4, message2)
        self.assertNotEqual(test_time, self.cluster1.lastStreamAddedTime)
        self.cluster1.addDocument(stream1)
        self.assertEqual(test_time, self.cluster1.lastStreamAddedTime)
        # Test if cluster id is set.
        self.assertEqual(self.cluster1.clusterId, stream1.clusterId)
        # Test that cluster mean is updated.
        self.assertEqual({1: 2 / 2., 2: 2., 3: 2.}, self.cluster1)
        # Test that cluster aggrefate is updated.
        self.assertEqual({1: 2, 2: 4, 3: 4}, self.cluster1.aggregateVector)
        # Test that document is added to cluster documents.
        self.assertEqual(stream1, self.cluster1.documentsInCluster[stream1.docId])
        self.cluster1.addDocument(stream2)
        self.assertEqual(3, self.cluster1.vectorWeights)
        self.assertEqual({1: 2 / 3., 2: 8 / 3., 3: 4 / 3.}, self.cluster1)
        self.assertEqual({1: 2, 2: 8, 3: 4}, self.cluster1.aggregateVector)

    def test_clustersIteration(self):
        clusters = [self.cluster1, self.cluster2, self.cluster3]
        # BELOW_THRESHOLD keeps clusters whose lastStreamAddedTime precedes the cutoff.
        self.assertEqual([self.cluster1], [cluster for cluster in StreamCluster.getClustersByAttributeAndThreshold(clusters, 'lastStreamAddedTime', test_time, StreamCluster.BELOW_THRESHOLD)])
        self.assertEqual([self.cluster1, self.cluster2], [cluster for cluster in StreamCluster.getClustersByAttributeAndThreshold(clusters, 'lastStreamAddedTime', test_time + timedelta(seconds=60), StreamCluster.BELOW_THRESHOLD)])
def clusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime):
    # Debug/analysis hook: prints map and cluster counts around each iteration
    # and dumps the single largest cluster's sample documents and top features.
    print currentMessageTime
    print '\n\n\nEntering:', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
    # Top-1 cluster by 'length' after sorting in descending order.
    for cluster, _ in sorted(StreamCluster.iterateByAttribute(hdStreamClusteringObject.clusters.values(), 'length'), key=itemgetter(1), reverse=True)[:1]:
        # First five document ids plus the five strongest dimensions.
        print cluster.clusterId, cluster.length, [stream.docId for stream in cluster.iterateDocumentsInCluster()][:5], cluster.getTopDimensions(numberOfFeatures=5)
    print 'Leaving: ', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
class StreamClusterTests(unittest.TestCase):
    """Tests for StreamCluster creation, merging, document addition and filtering."""

    def setUp(self):
        # Three single-stream clusters stamped at t-60s, t and t+60s.
        self.m1 = Message(1, 'sdf', 'A project to cluster high-dimensional streams.', test_time-timedelta(seconds=60))
        self.m1.vector = Vector({1:2,2:4})
        self.stream1 = Stream(1, self.m1)
        self.m2 = Message(2, 'sdf', 'A project to cluster high-dimensional streams.', test_time)
        self.m2.vector = Vector({2:4})
        self.stream2 = Stream(2, self.m2)
        self.m3 = Message(3, 'sdf', 'A project to cluster high-dimensional streams.', test_time+timedelta(seconds=60))
        self.m3.vector = Vector({2:4})
        self.stream3 = Stream(3, self.m3)
        self.cluster1 = StreamCluster(self.stream1)
        self.cluster2 = StreamCluster(self.stream2)
        self.cluster3 = StreamCluster(self.stream3)

    def test_initialization(self):
        # lastStreamAddedTime starts at the seed stream's timestamp.
        self.assertEqual(test_time-timedelta(seconds=60), self.cluster1.lastStreamAddedTime)
        self.assertEqual(test_time, self.cluster2.lastStreamAddedTime)

    def test_getClusterObjectToMergeFrom(self):
        documentsInCluster = list(self.cluster1.iterateDocumentsInCluster())
        mergedCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
        self.assertEqual(test_time-timedelta(seconds=60), mergedCluster.lastStreamAddedTime)
        self.assertEqual(self.cluster1, mergedCluster)
        # Contents match the source but the copy gets a fresh id.
        self.assertNotEqual(self.cluster1.clusterId, mergedCluster.clusterId)
        self.assertEqual(documentsInCluster, list(mergedCluster.iterateDocumentsInCluster()))
        self.assertEqual(self.cluster1.aggregateVector, mergedCluster.aggregateVector)
        self.assertEqual(self.cluster1.vectorWeights, mergedCluster.vectorWeights)
        self.assertEqual(self.cluster1.lastStreamAddedTime, mergedCluster.lastStreamAddedTime)

    def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(self):
        mergedCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
        mergedCluster.mergeCluster(self.cluster2)
        self.assertEqual([self.stream1, self.stream2], list(mergedCluster.iterateDocumentsInCluster()))
        meanVectorForAllDocuments = Vector.getMeanVector([self.stream1, self.stream2])
        self.assertEqual(meanVectorForAllDocuments, mergedCluster)
        # Merged documents get re-labelled with the merged cluster's id.
        self.assertEqual([mergedCluster.docId, mergedCluster.docId], list(doc.clusterId for doc in mergedCluster.iterateDocumentsInCluster()))
        # The newer (cluster2) timestamp wins.
        self.assertEqual(self.cluster2.lastStreamAddedTime, mergedCluster.lastStreamAddedTime)

    def test_mergeCluster_lastStreamAddedTime_lesser_than_original_cluster(self):
        mergedCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster3)
        mergedCluster.mergeCluster(self.cluster1)
        self.assertTrue(self.cluster1.lastStreamAddedTime<self.cluster3.lastStreamAddedTime)
        # An older incoming timestamp does not regress the merged cluster's time.
        self.assertEqual(self.cluster3.lastStreamAddedTime, mergedCluster.lastStreamAddedTime)
        self.assertNotEqual(self.cluster1.lastStreamAddedTime, mergedCluster.lastStreamAddedTime)

    def test_addDocument(self):
        message1 = Message(3, 'sdf', 'A project to cluster high-dimensional streams.', test_time)
        message1.vector = Vector({3:4})
        stream1 = Stream(3, message1)
        message2 = Message(4, 'sdf', 'A project to cluster high-dimensional streams.', test_time)
        message2.vector = Vector({2:4})
        stream2 = Stream(4, message2)
        self.assertNotEqual(test_time, self.cluster1.lastStreamAddedTime)
        self.cluster1.addDocument(stream1)
        self.assertEqual(test_time, self.cluster1.lastStreamAddedTime)
        # Test if cluster id is set.
        self.assertEqual(self.cluster1.clusterId, stream1.clusterId)
        # Test that cluster mean is updated.
        self.assertEqual({1:2/2.,2:2.,3:2.}, self.cluster1)
        # Test that cluster aggrefate is updated.
        self.assertEqual({1:2,2:4,3:4}, self.cluster1.aggregateVector)
        # Test that document is added to cluster documents.
        self.assertEqual(stream1, self.cluster1.documentsInCluster[stream1.docId])
        self.cluster1.addDocument(stream2)
        self.assertEqual(3, self.cluster1.vectorWeights)
        self.assertEqual({1:2/3.,2:8/3.,3:4/3.}, self.cluster1)
        self.assertEqual({1:2,2:8,3:4}, self.cluster1.aggregateVector)

    def test_clustersIteration(self):
        clusters = [self.cluster1, self.cluster2, self.cluster3]
        # BELOW_THRESHOLD keeps clusters whose lastStreamAddedTime precedes the cutoff.
        self.assertEqual([self.cluster1], [cluster for cluster in StreamCluster.getClustersByAttributeAndThreshold(clusters, 'lastStreamAddedTime', test_time ,StreamCluster.BELOW_THRESHOLD)] )
        self.assertEqual([self.cluster1, self.cluster2], [cluster for cluster in StreamCluster.getClustersByAttributeAndThreshold(clusters, 'lastStreamAddedTime', test_time+timedelta(seconds=60) ,StreamCluster.BELOW_THRESHOLD)] )