def test_addDocument(self):
    """Adding streams updates timestamp, id, mean, aggregate, and doc map."""
    msg_a = Message(3, 'sdf', 'A project to cluster high-dimensional streams.', test_time)
    msg_a.vector = Vector({3: 4})
    first_stream = Stream(3, msg_a)
    msg_b = Message(4, 'sdf', 'A project to cluster high-dimensional streams.', test_time)
    msg_b.vector = Vector({2: 4})
    second_stream = Stream(4, msg_b)
    self.assertNotEqual(test_time, self.cluster1.lastStreamAddedTime)
    self.cluster1.addDocument(first_stream)
    self.assertEqual(test_time, self.cluster1.lastStreamAddedTime)
    # The stream now carries the cluster's id.
    self.assertEqual(self.cluster1.clusterId, first_stream.clusterId)
    # The cluster mean reflects the newly added stream.
    self.assertEqual({1: 2 / 2., 2: 2., 3: 2.}, self.cluster1)
    # The cluster aggregate reflects the newly added stream.
    self.assertEqual({1: 2, 2: 4, 3: 4}, self.cluster1.aggregateVector)
    # The stream is tracked in the cluster's document map.
    self.assertEqual(first_stream, self.cluster1.documentsInCluster[first_stream.docId])
    self.cluster1.addDocument(second_stream)
    self.assertEqual(3, self.cluster1.vectorWeights)
    self.assertEqual({1: 2 / 3., 2: 8 / 3., 3: 4 / 3.}, self.cluster1)
    self.assertEqual({1: 2, 2: 8, 3: 4}, self.cluster1.aggregateVector)
def setUp(self):
    """Build one stream backed by a message, plus a free-standing vector."""
    self.message = Message(1, 'sdf', 'A project to cluster high-dimensional streams.', datetime.now())
    self.message.vector = Vector({1: 2., 2: 3.})
    self.s1 = Stream(1, self.message)
    self.v1 = Vector({1: 2., 3: 3.})
def mapper(self, _, value):
    """Record a similar-stream edge when two stream vectors are close enough.

    `value` is a pair [(id0, vec0), (id1, vec1)] of stream ids with their raw
    vector dicts.  When the cosine similarity reaches `self.ssa_threshold`,
    the larger id is added to the similar-stream set of the smaller id.
    Yields nothing; the unreachable `yield` only marks this as a generator,
    as the MR framework expects.
    """
    if False:
        yield  # I'm a generator!
    [(id0, vec0), (id1, vec1)] = value
    vec0, vec1 = Vector(vec0), Vector(vec1)
    if vec0.cosineSimilarity(vec1) >= self.ssa_threshold:
        # Fix: use a plain if/else instead of abusing a conditional
        # expression purely for its side effects.
        if id0 < id1:
            self.streamIdToSimilarStreamsMap[id0].add(id1)
        else:
            self.streamIdToSimilarStreamsMap[id1].add(id0)
def setUp(self):
    """Reset the cluster id counter; build two clusters and two loose docs."""
    Cluster.clusterIdCounter = 0
    self.docx = Document(1, {1: 2, 2: 4})
    self.docy = Document(2, {2: 4})
    self.cluster1 = Cluster(self.docx)
    self.cluster2 = Cluster(self.docy)
    self.doc1 = Document(3, Vector({3: 4}))
    self.doc2 = Document(4, Vector({2: 4}))
def setUp(self):
    """One stream seeded 60s before test_time, plus a newer message for it."""
    self.m1 = Message(1, 'sdf', 'A project to cluster high-dimensional streams.', test_time - timedelta(seconds=60))
    self.m1.vector = Vector({1: 1., 2: 3.})
    self.stream = Stream(1, self.m1)
    self.m2 = Message(1, 'sdf', 'A project to cluster high-dimensional streams.', test_time)
    self.m2.vector = Vector({2: 3.})
def cluster(self, dataIterator):
    """Consume raw data items and cluster the streams they belong to.

    Each item is converted to a Message; messages that fail
    DataStreamMethods.messageInOrder are skipped entirely.  A stream seen
    for the first time is clustered immediately; an existing stream is
    updated with exponential decay and re-clustered only when it drifts
    far enough (euclidean distance > 10) from its pre-update vector.
    Dimension updates, cluster filtering and cluster analysis hooks run
    once per in-order message.
    """
    i = 1  # 1-based count of messages seen; passed to the analysis hook
    for data in dataIterator:
        message = self.convertDataToMessageMethod(data, **self.stream_settings)
        # message = data
        if DataStreamMethods.messageInOrder(message.timeStamp):
            UtilityMethods.updatePhraseTextToPhraseObject(
                message.vector, message.timeStamp,
                self.phraseTextToPhraseObjectMap, **self.stream_settings)
            if message.streamId not in self.streamIdToStreamObjectMap:
                # First message for this stream: create it and cluster it.
                self.streamIdToStreamObjectMap[message.streamId] = Stream(
                    message.streamId, message)
                self.getClusterAndUpdateExistingClusters(
                    self.streamIdToStreamObjectMap[message.streamId])
            else:
                # Snapshot the stream's vector before the decayed update so
                # drift can be measured afterwards.
                previousStreamObject = Vector(
                    vectorInitialValues=self.streamIdToStreamObjectMap[
                        message.streamId])
                self.streamIdToStreamObjectMap[
                    message.streamId].updateForMessage(
                        message, VectorUpdateMethods.exponentialDecay,
                        **self.stream_settings)
                streamObject = self.streamIdToStreamObjectMap[
                    message.streamId]
                distance = Vector.euclideanDistance(
                    streamObject, previousStreamObject)
                if distance > 10:
                    # Stream drifted significantly -- re-cluster it.
                    # print i, len(self.clusters), distance
                    self.getClusterAndUpdateExistingClusters(
                        self.streamIdToStreamObjectMap[message.streamId])
            self.updateDimensionsMethod.call(
                message.timeStamp, hdStreamClusteringObject=self,
                currentMessageTime=message.timeStamp)
            self.clusterFilteringMethod.call(
                message.timeStamp, hdStreamClusteringObject=self,
                currentMessageTime=message.timeStamp)
            # self.clusterAnalysisMethod.call(message.timeStamp, hdStreamClusteringObject=self, currentMessageTime=message.timeStamp)
            # NOTE: analysis is keyed on wall-clock time, not message time.
            self.clusterAnalysisMethod.call(
                time.time(), hdStreamClusteringObject=self,
                currentMessageTime=message.timeStamp, numberOfMessages=i)
        # print i, len(self.clusters)
        i += 1
def cluster(self, dataIterator):
    """Consume raw data items and cluster the streams they belong to.

    Each item is converted to a Message; out-of-order messages (per
    DataStreamMethods.messageInOrder) are skipped.  New streams are
    clustered immediately; existing streams are updated with exponential
    decay and re-clustered only when the update moves the stream vector
    by a euclidean distance greater than 10.  Dimension-update, filtering
    and analysis hooks run once per in-order message.
    """
    i = 1  # 1-based message counter, forwarded to the analysis hook
    for data in dataIterator:
        message = self.convertDataToMessageMethod(data, **self.stream_settings)
        # message = data
        if DataStreamMethods.messageInOrder(message.timeStamp):
            UtilityMethods.updatePhraseTextToPhraseObject(
                message.vector, message.timeStamp,
                self.phraseTextToPhraseObjectMap, **self.stream_settings)
            if message.streamId not in self.streamIdToStreamObjectMap:
                # Unknown stream: register it and cluster it right away.
                self.streamIdToStreamObjectMap[message.streamId] = Stream(message.streamId, message)
                self.getClusterAndUpdateExistingClusters(self.streamIdToStreamObjectMap[message.streamId])
            else:
                # Copy the current vector so post-update drift can be measured.
                previousStreamObject = Vector(vectorInitialValues=self.streamIdToStreamObjectMap[message.streamId])
                self.streamIdToStreamObjectMap[message.streamId].updateForMessage(
                    message, VectorUpdateMethods.exponentialDecay, **self.stream_settings)
                streamObject = self.streamIdToStreamObjectMap[message.streamId]
                distance = Vector.euclideanDistance(streamObject, previousStreamObject)
                if distance > 10:
                    # The stream moved significantly -- re-cluster it.
                    # print i, len(self.clusters), distance
                    self.getClusterAndUpdateExistingClusters(self.streamIdToStreamObjectMap[message.streamId])
            self.updateDimensionsMethod.call(message.timeStamp, hdStreamClusteringObject=self, currentMessageTime=message.timeStamp)
            self.clusterFilteringMethod.call(message.timeStamp, hdStreamClusteringObject=self, currentMessageTime=message.timeStamp)
            # self.clusterAnalysisMethod.call(message.timeStamp, hdStreamClusteringObject=self, currentMessageTime=message.timeStamp)
            # NOTE: analysis is keyed on wall-clock time, not message time.
            self.clusterAnalysisMethod.call(time.time(), hdStreamClusteringObject=self, currentMessageTime=message.timeStamp, numberOfMessages=i)
        # print i, len(self.clusters)
        i += 1
def setUp(self):
    """Phrase fixtures: raw phrase vector, dimension map, and phrase objects."""
    self.phraseVector = {'project': 1, 'cluster': 1, 'highdimensional': 1, 'streams': 1}
    self.phraseTextAndDimensionMap = TwoWayMap()
    self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'project', 0)
    self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'cluster', 1)
    # 'abcd' is stale: last seen well beyond the allowed inactivity window.
    stale_time = test_time - 3 * stream_settings['max_phrase_inactivity_time_in_seconds']
    self.phraseTextToPhraseObjectMap = {
        'project': Phrase('project', test_time, score=8),
        'cluster': Phrase('cluster', test_time, score=8),
        'abcd': Phrase('abcd', stale_time, score=8),
    }
    self.vector = Vector({0: 1, 1: 1, 2: 1, 3: 1})
    # Shrink the dimension budget for the test; tearDown restores the backup.
    self.initial_max_dimensions = stream_settings['dimensions']
    stream_settings['dimensions'] = 2
def _getVectorMappedToDimension(self, vector, phraseTextAndDimensionMap):
    """Project this vector's phrases onto their numeric dimension ids.

    Phrases absent from the forward map are silently dropped.
    NOTE(review): the `vector` parameter is never used -- the method reads
    phrases from `self`; confirm whether that is intentional.
    """
    forwardMap = phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD)
    result = Vector()
    for phrase in self:
        if phrase in forwardMap:
            result[forwardMap[phrase]] = self[phrase]
    return result
def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(self):
    """Merging adopts the joint mean and the newer lastStreamAddedTime."""
    merged = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
    merged.mergeCluster(self.cluster2)
    self.assertEqual([self.stream1, self.stream2], list(merged.iterateDocumentsInCluster()))
    expected_mean = Vector.getMeanVector([self.stream1, self.stream2])
    self.assertEqual(expected_mean, merged)
    cluster_ids = [doc.clusterId for doc in merged.iterateDocumentsInCluster()]
    self.assertEqual([merged.docId, merged.docId], cluster_ids)
    self.assertEqual(self.cluster2.lastStreamAddedTime, merged.lastStreamAddedTime)
def getClusterObjectToMergeFrom(streamCluster):
    """Seed a fresh StreamCluster, based on streamCluster, to merge others into."""
    streamCluster.lastMessageTime = streamCluster.lastStreamAddedTime
    mergedCluster = StreamCluster(streamCluster, shouldUpdateDocumentId=False)
    # Start from an empty aggregate so merged clusters accumulate cleanly.
    mergedCluster.aggregateVector = Vector({})
    mergedCluster.vectorWeights = 0.0
    StreamCluster.updateClusterAttributes(mergedCluster, streamCluster)
    return mergedCluster
def test_maxClusterSize(self):
    """maxClusterSize tracks the largest cluster as documents arrive."""
    self.assertEqual(1, self.crowd.maxClusterSize)
    extra_message = Message(4, 'sdf', 'A project to cluster high-dimensional streams.', test_time)
    extra_message.vector = Vector({2: 4})
    self.cluster.addDocument(Stream(4, extra_message))
    self.assertEqual(2, self.crowd.maxClusterSize)
def setUp(self):
    """Three streams at t-60s, t, and t+60s, each seeding its own cluster."""
    self.m1 = Message(1, 'sdf', 'A project to cluster high-dimensional streams.', test_time - timedelta(seconds=60))
    self.m1.vector = Vector({1: 2, 2: 4})
    self.stream1 = Stream(1, self.m1)
    self.m2 = Message(2, 'sdf', 'A project to cluster high-dimensional streams.', test_time)
    self.m2.vector = Vector({2: 4})
    self.stream2 = Stream(2, self.m2)
    self.m3 = Message(3, 'sdf', 'A project to cluster high-dimensional streams.', test_time + timedelta(seconds=60))
    self.m3.vector = Vector({2: 4})
    self.stream3 = Stream(3, self.m3)
    self.cluster1 = StreamCluster(self.stream1)
    self.cluster2 = StreamCluster(self.stream2)
    self.cluster3 = StreamCluster(self.stream3)
def createDocumentFromLine(docId, line):
    """Build a Document from a whitespace-separated line.

    The first token of the line is the cluster label; remaining tokens are
    words, mapped to stable integer dimensions via wordToDimensionMap and
    counted into the document vector.
    """
    vector = Vector()
    tokens = line.split()
    for word in tokens[1:]:
        if word not in wordToDimensionMap:
            # Assign the next free dimension to an unseen word.
            wordToDimensionMap[word] = len(wordToDimensionMap)
        dimension = wordToDimensionMap[word]
        if dimension in vector:
            vector[dimension] += 1
        else:
            vector[dimension] = 1
    return Document(docId, vector, clusterId=tokens[0])
def convertTweetJSONToMessage(tweet, **twitter_stream_settings):
    """Convert a raw tweet JSON dict into a Message with a phrase-count vector."""
    tweetTime = getDateTimeObjectFromTweetTimestamp(tweet['created_at'])
    message = Message(tweet['user']['screen_name'], tweet['id'], tweet['text'], tweetTime)
    message.vector = Vector()
    phrases = getPhrases(getWordsFromRawEnglishMessage(tweet['text']),
                         twitter_stream_settings['min_phrase_length'],
                         twitter_stream_settings['max_phrase_length'])
    for phrase in phrases:
        # Count each occurrence of the phrase.
        if phrase in message.vector:
            message.vector[phrase] += 1
        else:
            message.vector[phrase] = 1
    return message
def getClusterFromMapFormat(clusterMap):
    """Rebuild a StreamCluster from its serialized map representation."""
    # A throwaway message/stream pair is needed only to construct the cluster.
    dummyMessage = Message(1, '', '', datetime.now())
    dummyMessage.vector = Vector({})
    cluster = StreamCluster(Stream(1, dummyMessage))
    cluster.clusterId = clusterMap['clusterId']
    cluster.lastStreamAddedTime = getDateTimeObjectFromTweetTimestamp(clusterMap['lastStreamAddedTime'])
    cluster.mergedClustersList = clusterMap['mergedClustersList']
    cluster.documentsInCluster = clusterMap['streams']
    # Restore the cluster's own dimension weights.
    for dimension, weight in clusterMap['dimensions'].iteritems():
        cluster[dimension] = weight
    return cluster
def plot_points(self, fr=-5.0, to=5.0, values=50, name='cubic'):
    """Sample the named equation on [fr, to] and return the points as Vectors.

    Args:
        fr, to: inclusive range to sample.
        values: number of evenly spaced sample points.
        name: key into self.EQUATIONS selecting the function to plot.

    Returns:
        A list of Vector([x, f(x)]) sample points.

    Raises:
        ValueError: if `name` is not in self.EQUATIONS.  (Fix: previously an
        unknown name crashed with UnboundLocalError, because the fallback
        assignment `f = self.cubic` was commented out and `f` was only
        bound inside the `if`.)
    """
    if name not in self.EQUATIONS:
        raise ValueError('unknown equation name: %r' % (name,))
    f = getattr(Function, name)  # resolve the plotting function by name
    xs = np.linspace(fr, to, values, endpoint=True)
    # f(0, x): the first argument mirrors the original unbound-method call.
    return [Vector([x, f(0, x)]) for x in xs]
def test_setSignatureUsingVectors(self):
    """Signatures encode which side of each hyperplane a document falls on."""
    dimension_map = TwoWayMap()
    dimension_map.set(TwoWayMap.MAP_FORWARD, 'a', 1)
    dimension_map.set(TwoWayMap.MAP_FORWARD, 'b', 2)
    doc_in_vector = Document(1, {'a': 1, 'b': 4})
    doc_not_in_vector = Document(1, {'a': 1, 'c': 4})
    # Two unit vectors defining the hyperplanes.
    vectors = [Vector({1: 3 / 5., 2: -4 / 5.}),
               Vector({1: -5 / 13., 2: 12 / 13.})]
    doc_in_vector.setSignatureUsingVectors(vectors, dimension_map)
    doc_not_in_vector.setSignatureUsingVectors(vectors, dimension_map)
    self.assertEqual(Signature('01'), doc_in_vector.signature)
    self.assertEqual(Signature('10'), doc_not_in_vector.signature)
def iterateUserDocuments(fileName):
    """Yield (screen_name, aggregated vector) pairs for tweets in fileName.

    Phrases are remapped to stable string ids so vectors from different
    tweets share one dimension space; vectors are summed per user.
    """
    aggregated = defaultdict(Vector)
    phraseIds = defaultdict(int)
    for tweet in FileIO.iterateJsonFromFile(fileName):
        textVector = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(
            tweet, **default_experts_twitter_stream_settings).vector
        remapped = Vector()
        for phrase in textVector:
            if phrase not in phraseIds:
                # Assign the next free id (stored as a string) to a new phrase.
                phraseIds[phrase] = str(len(phraseIds))
            remapped[phraseIds[phrase]] = textVector[phrase]
        aggregated[tweet['user']['screen_name'].lower()] += remapped
    for userName, userVector in aggregated.iteritems():
        yield userName, userVector
def iterateTweetUsersAfterCombiningTweets(fileName, **stream_settings):
    """Yield (screen_name, combined vector) pairs from a gzip of tweets.

    Each tweet's phrase vector is remapped onto shared string dimension ids
    and summed into a single per-user vector.
    """
    perUserVectors = defaultdict(Vector)
    phraseToId = defaultdict(int)
    for tweet in TweetFiles.iterateTweetsFromGzip(fileName):
        textVector = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(
            tweet, **stream_settings).vector
        remappedVector = Vector()
        for phrase in textVector:
            if phrase not in phraseToId:
                # New phrase: hand out the next id, stored as a string.
                phraseToId[phrase] = str(len(phraseToId))
            remappedVector[phraseToId[phrase]] = textVector[phrase]
        perUserVectors[tweet['user']['screen_name'].lower()] += remappedVector
    for screenName, combined in perUserVectors.iteritems():
        yield screenName, combined
def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(self):
    """After a merge: both streams present, mean vector correct, newer timestamp kept."""
    target = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
    target.mergeCluster(self.cluster2)
    self.assertEqual([self.stream1, self.stream2],
                     list(target.iterateDocumentsInCluster()))
    self.assertEqual(Vector.getMeanVector([self.stream1, self.stream2]), target)
    # Every document now points at the merged cluster's id.
    self.assertEqual([target.docId, target.docId],
                     [d.clusterId for d in target.iterateDocumentsInCluster()])
    self.assertEqual(self.cluster2.lastStreamAddedTime, target.lastStreamAddedTime)
def setUp(self):
    """Two '#tcot'-heavy clusters plus two extra streams added to them."""
    self.tweet = {
        'user': {'screen_name': 'abc'},
        'id': 10,
        'text': 'A project to cluster high-dimensional streams.',
        'created_at': 'Tue Mar 01 05:59:59 +0000 2011',
    }
    m1 = Message(1, '', '', datetime.now())
    m1.vector = Vector({'#tcot': 2, 'dsf': 4})
    self.cluster1 = StreamCluster(Stream(1, m1))
    m2 = Message(2, '', '', datetime.now())
    m2.vector = Vector({'#tcot': 4})
    self.cluster2 = StreamCluster(Stream(2, m2))
    m3 = Message(3, '', '', datetime.now())
    m3.vector = Vector(Vector({'#tcot': 2}))
    m4 = Message(4, '', '', datetime.now())
    m4.vector = Vector(Vector({'#tcot': 2}))
    self.doc1 = Stream(1, m3)
    self.doc2 = Stream(2, m4)
    # Mean over all four vectors, computed before the streams are added.
    self.meanVectorForAllDocuments = Vector.getMeanVector(
        [self.cluster1, self.cluster2, self.doc1, self.doc2])
    self.cluster1.addDocument(self.doc1)
    self.cluster2.addDocument(self.doc2)
def test_exponentialDecay(self):
    """Existing weights are halved while the new vector is added at full weight."""
    VectorUpdateMethods.exponentialDecay(self.s1, self.v1, 0.5, 1)
    self.assertEqual(Vector({1: 3, 2: 1.5, 3: 3}), self.s1)
def _getDocumentFromTuple((user, text)): vector, words = Vector(), text.split() for word in words[1:]: if word not in vector: vector[word] = 1 else: vector[word] += 1 return Document(user, vector)
def add_arrow_vector(self, vector, colour='k', from_vec=None):
    """Queue an arrow from `from_vec` to `vector` for drawing.

    Args:
        vector: tip of the arrow.
        colour: matplotlib-style colour code.
        from_vec: tail of the arrow; defaults to the origin.  (Fix: the
            default used to be a single shared mutable Vector([0, 0])
            instance, so every no-argument call aliased the same object.)
    """
    if from_vec is None:
        from_vec = Vector([0, 0])
    self.update_size_if_required(vector)
    # NOTE(review): the return value of get_arrow_buffer() was bound to an
    # unused local; the call is kept in case it has side effects -- confirm
    # and drop if it does not.
    self.get_arrow_buffer()
    self.arrows.append([from_vec, vector, colour])
def test_addWithoutDecay(self):
    """Vectors are summed dimension-wise with no decay applied."""
    VectorUpdateMethods.addWithoutDecay(self.s1, self.v1)
    self.assertEqual(Vector({1: 4, 2: 3, 3: 3}), self.s1)
@author: kykamath ''' import sys, os, unittest, cjson sys.path.append('../../../') from library.vector import Vector from itertools import combinations from experiments.ssa.ssa_sim_mr import SSASimilarityMR from experiments.ssa.ssa import StreamSimilarityAggregationMR, ItemsClusterer,\ SimilarStreamAggregation test_file = 'ssa_test.dat.gz' test_ssa_threshold = 0.75 vectors = { '1': Vector({'1':4, '2':8}), '2': Vector({'1':4, '2':8}), '3': Vector({'1':4, '2':8}), '4': Vector({'2':8}), '5': Vector({'3':4, '4':8}), '6': Vector({'4':8}), '7': Vector({'3':4, '4':8}), '8': Vector({'3':4}) } def createTestFile(): with open(test_file, 'w') as f: for v1, v2 in combinations(vectors.iteritems(),2): f.write('%s\t%s\n'%(cjson.encode(['x']), cjson.encode([(v1[0], v1[1]), (v2[0], v2[1])]))) class ItemsClustererTests(unittest.TestCase): def setUp(self): self.clusterer = ItemsClusterer() def test_addNewCluster(self):
def topDimensions(self, numberOfDimensions=10):
    """Return the strongest dimensions of the mean vector over all clusters."""
    meanVector = Vector.getMeanVector(self.clusters.itervalues())
    return meanVector.getTopDimensions(numberOfFeatures=numberOfDimensions)
def createDocumentFromLine(docId, line):
    """Build a Document keyed by the line's first token, counting the rest.

    The first whitespace token becomes the document id passed to Document;
    the remaining tokens are counted into the word vector.  The `docId`
    argument is not used as the id here (the label token is).
    """
    vector = Vector()
    tokens = line.split()
    for word in tokens[1:]:
        if word in vector:
            vector[word] += 1
        else:
            vector[word] = 1
    return Document(tokens[0], vector)
def mapper(self, _, value):
    """Link two streams as similar when their cosine similarity passes threshold.

    `value` is [(id0, vec0), (id1, vec1)]; the edge is always stored under
    the smaller stream id.  Yields nothing -- the unreachable `yield` only
    marks the method as a generator for the MR framework.
    """
    if False:
        yield  # I'm a generator!
    [(id0, vec0), (id1, vec1)] = value
    vec0, vec1 = Vector(vec0), Vector(vec1)
    if vec0.cosineSimilarity(vec1) >= self.ssa_threshold:
        # Fix: replaced the side-effecting conditional expression with an
        # explicit ordering followed by a single add().
        smaller, larger = (id0, id1) if id0 < id1 else (id1, id0)
        self.streamIdToSimilarStreamsMap[smaller].add(larger)
def topDimensions(self, numberOfDimensions=10):
    """Top dimensions of the mean vector taken across all tracked clusters."""
    mean = Vector.getMeanVector(self.clusters.itervalues())
    return mean.getTopDimensions(numberOfFeatures=numberOfDimensions)
def test_updateForMessage_exponentialDecay(self):
    """Updating with decay halves old weights and adds the new message vector."""
    self.stream.updateForMessage(self.m2, VectorUpdateMethods.exponentialDecay, **stream_settings)
    self.assertEqual(self.stream, Vector({1: 0.5, 2: 4.5}))
def test_updateForMessage_addWithoutDecay(self):
    """Updating without decay simply sums the message vector into the stream."""
    self.stream.updateForMessage(self.m2, VectorUpdateMethods.addWithoutDecay, **stream_settings)
    self.assertEqual(self.stream, Vector({1: 1., 2: 6.}))
def meanClusteringDistance(clusterMeans): return np.mean([Vector.euclideanDistance(Vector(dict(c1)), Vector(dict(c2))) for c1, c2 in combinations(clusterMeans,2)]) def writeLocationToUserMap(place):
def offlineLSHClusteringDemo():
    """End-to-end LSH clustering demo on offline train/test files.

    Trains cluster signatures from '../data/train_offline.dat', then
    classifies the documents in '../data/test_offline.dat' by looking up
    nearest clusters through the signature permutations and picking the one
    with the highest cosine similarity.  Returns the purity of the
    predictions against the documents' true cluster labels.
    """
    wordToDimensionMap = {}

    def createDocumentFromLine(docId, line):
        # First token of the line is the true cluster label; remaining
        # tokens are words, mapped to stable integer dimensions.
        vector = Vector()
        words = line.split()
        for word in words[1:]:
            if word not in wordToDimensionMap:
                wordToDimensionMap[word] = len(wordToDimensionMap)
            wordDimension = wordToDimensionMap[word]
            if wordDimension not in vector:
                vector[wordDimension] = 1
            else:
                vector[wordDimension] += 1
        return Document(docId, vector, clusterId=words[0])

    # LSH model parameters.
    dimensions = 53
    signatureLength = 13
    numberOfPermutations = 5
    unitVector = RandomGaussianUnitVector(dimensions=dimensions, mu=0, sigma=1)
    vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, unitVector)
    signaturePermutations = [
        SignaturePermutationWithTrie(signatureLength)
        for i in range(numberOfPermutations)
    ]
    permutatedUnitVectors = [
        unitVector.getPermutedVector(r) for r in vectorPermutations
    ]
    # Build LSH Model.
    # Read training documents.
    traningDocumentsMap = {}
    for docId, l in enumerate(
            FileIO.iterateLinesFromFile('../data/train_offline.dat')):
        traningDocumentsMap[docId] = createDocumentFromLine(docId, l)
    # Construct cluster vectors.
    clusterToDocumentsMap = defaultdict(list)
    for document in traningDocumentsMap.values():
        clusterToDocumentsMap[document.clusterId].append(document)
    # One representative document per cluster, holding the mean vector.
    clusterMap = {}
    for k, v in clusterToDocumentsMap.iteritems():
        clusterMap[k] = Document(docId=k, vector=Vector.getMeanVector(v),
                                 clusterId=k)
    # Create signatures and signaturePermutations for all the clusters.
    map(
        lambda document: document.setSignatureUsingVectors(
            permutatedUnitVectors), clusterMap.values())
    for permutation in signaturePermutations:
        for document in clusterMap.values():
            permutation.addDocument(document)
    # Testing the model.
    # Read testing documents.
    testDocumentsMap = {}
    for docId, l in enumerate(
            FileIO.iterateLinesFromFile('../data/test_offline.dat')):
        testDocumentsMap[docId] = createDocumentFromLine(docId, l)
    # Create signatures for test documents
    map(
        lambda document: document.setSignatureUsingVectors(
            permutatedUnitVectors), testDocumentsMap.values())
    predicted, labels = [], []
    for t in testDocumentsMap.values():
        # Union of candidate clusters across all signature permutations.
        possibleNearestClusters = reduce(
            lambda x, y: x.union(y),
            (permutation.getNearestDocuments(t)
             for permutation in signaturePermutations), set())
        # Pick the candidate with the highest cosine similarity.
        predictedClass = max(
            ((clusterId, clusterMap[clusterId].cosineSimilarity(t))
             for clusterId in possibleNearestClusters),
            key=itemgetter(1))
        predicted.append(predictedClass[0])
        labels.append(t.clusterId)
    return EvaluationMetrics.purity(predicted, labels)