def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(self):
    """Merging cluster2 into a merge-copy of cluster1 combines documents,
    yields the mean vector of both streams, re-tags each document with the
    merged cluster's id, and adopts cluster2's lastStreamAddedTime."""
    merged = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
    merged.mergeCluster(self.cluster2)
    self.assertEqual([self.stream1, self.stream2],
                     list(merged.iterateDocumentsInCluster()))
    expectedMean = Vector.getMeanVector([self.stream1, self.stream2])
    self.assertEqual(expectedMean, merged)
    # Both documents must now point back at the merged cluster.
    self.assertEqual([merged.docId] * 2,
                     [doc.clusterId for doc in merged.iterateDocumentsInCluster()])
    self.assertEqual(self.cluster2.lastStreamAddedTime, merged.lastStreamAddedTime)
def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(self):
    """After merging, the merged cluster equals the mean of both streams,
    owns both documents, and carries cluster2's (later) lastStreamAddedTime."""
    mergedCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
    mergedCluster.mergeCluster(self.cluster2)
    documentsInMerged = list(mergedCluster.iterateDocumentsInCluster())
    self.assertEqual([self.stream1, self.stream2], documentsInMerged)
    self.assertEqual(Vector.getMeanVector([self.stream1, self.stream2]),
                     mergedCluster)
    # Each contained document's clusterId is rewritten to the merged id.
    clusterIds = list(doc.clusterId
                      for doc in mergedCluster.iterateDocumentsInCluster())
    self.assertEqual([mergedCluster.docId, mergedCluster.docId], clusterIds)
    self.assertEqual(self.cluster2.lastStreamAddedTime,
                     mergedCluster.lastStreamAddedTime)
def setUp(self):
    """Build the shared fixtures: a sample tweet dict, two single-stream
    clusters, two extra streams, and the mean vector over all four —
    computed BEFORE the extra streams are added to the clusters."""
    self.tweet = {'user':{'screen_name': 'abc'}, 'id':10, 'text':'A project to cluster high-dimensional streams.', 'created_at': 'Tue Mar 01 05:59:59 +0000 2011'}
    firstMessage = Message(1, '', '', datetime.now())
    firstMessage.vector = Vector({'#tcot':2,'dsf':4})
    self.cluster1 = StreamCluster(Stream(1, firstMessage))
    secondMessage = Message(2, '', '', datetime.now())
    secondMessage.vector = Vector({'#tcot':4})
    self.cluster2 = StreamCluster(Stream(2, secondMessage))
    thirdMessage = Message(3, '', '', datetime.now())
    thirdMessage.vector = Vector(Vector({'#tcot':2}))
    fourthMessage = Message(4, '', '', datetime.now())
    fourthMessage.vector = Vector(Vector({'#tcot':2}))
    self.doc1 = Stream(1, thirdMessage)
    self.doc2 = Stream(2, fourthMessage)
    # Mean over the clusters in their one-stream state plus both docs;
    # must precede the addDocument calls below.
    self.meanVectorForAllDocuments = Vector.getMeanVector(
        [self.cluster1, self.cluster2, self.doc1, self.doc2])
    self.cluster1.addDocument(self.doc1)
    self.cluster2.addDocument(self.doc2)
def topDimensions(self, numberOfDimensions=10):
    """Return the top `numberOfDimensions` dimensions of the mean vector
    taken over all cluster vectors."""
    meanVector = Vector.getMeanVector(self.clusters.itervalues())
    return meanVector.getTopDimensions(numberOfFeatures=numberOfDimensions)
def topDimensions(self, numberOfDimensions=10):
    """Top `numberOfDimensions` dimensions of the mean of every cluster's vector."""
    clusterVectors = self.clusters.itervalues()
    return Vector.getMeanVector(clusterVectors).getTopDimensions(
        numberOfFeatures=numberOfDimensions)
def offlineLSHClusteringDemo():
    """Offline LSH clustering demo.

    Trains one mean-vector "cluster document" per label from
    ../data/train_offline.dat, indexes their LSH signatures in several
    permutation tries, then classifies each document from
    ../data/test_offline.dat by cosine similarity against the candidate
    clusters returned by the tries.

    Returns:
        The purity score (EvaluationMetrics.purity) of predicted vs. true
        cluster labels.
    """
    wordToDimensionMap = {}

    def createDocumentFromLine(docId, line):
        """Turn a 'label word word ...' line into a term-frequency Document.

        The first token is the class label; each distinct word is assigned a
        stable integer dimension via the shared wordToDimensionMap.
        """
        vector = Vector()
        words = line.split()
        for word in words[1:]:
            if word not in wordToDimensionMap:
                wordToDimensionMap[word] = len(wordToDimensionMap)
            wordDimension = wordToDimensionMap[word]
            if wordDimension not in vector:
                vector[wordDimension] = 1
            else:
                vector[wordDimension] += 1
        return Document(docId, vector, clusterId=words[0])

    dimensions = 53
    signatureLength = 13
    numberOfPermutations = 5
    unitVector = RandomGaussianUnitVector(dimensions=dimensions, mu=0, sigma=1)
    vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, unitVector)
    signaturePermutations = [
        SignaturePermutationWithTrie(signatureLength)
        for i in range(numberOfPermutations)
    ]
    permutatedUnitVectors = [
        unitVector.getPermutedVector(r) for r in vectorPermutations
    ]
    # Build LSH model: read training documents.
    traningDocumentsMap = {}
    for docId, l in enumerate(
            FileIO.iterateLinesFromFile('../data/train_offline.dat')):
        traningDocumentsMap[docId] = createDocumentFromLine(docId, l)
    # Collapse the training documents into one mean vector per label.
    clusterToDocumentsMap = defaultdict(list)
    for document in traningDocumentsMap.values():
        clusterToDocumentsMap[document.clusterId].append(document)
    clusterMap = {}
    for k, v in clusterToDocumentsMap.iteritems():
        clusterMap[k] = Document(docId=k, vector=Vector.getMeanVector(v),
                                 clusterId=k)
    # Sign every cluster and index it in each permutation trie.
    # FIX: the original called map(lambda ...) purely for its side effect;
    # under Python 3's lazy map the lambdas never run and no signatures are
    # ever set. An explicit loop behaves identically under Python 2 and
    # correctly under Python 3.
    for document in clusterMap.values():
        document.setSignatureUsingVectors(permutatedUnitVectors)
    for permutation in signaturePermutations:
        for document in clusterMap.values():
            permutation.addDocument(document)
    # Test the model: read and sign the test documents.
    testDocumentsMap = {}
    for docId, l in enumerate(
            FileIO.iterateLinesFromFile('../data/test_offline.dat')):
        testDocumentsMap[docId] = createDocumentFromLine(docId, l)
    for document in testDocumentsMap.values():
        document.setSignatureUsingVectors(permutatedUnitVectors)
    predicted, labels = [], []
    for t in testDocumentsMap.values():
        # Union of candidate clusters across all permutation tries.
        possibleNearestClusters = reduce(
            lambda x, y: x.union(y),
            (permutation.getNearestDocuments(t)
             for permutation in signaturePermutations),
            set())
        # Pick the candidate with the highest cosine similarity.
        # NOTE(review): max() raises ValueError if the tries return no
        # candidates at all — unchanged from the original behavior.
        predictedClass = max(
            ((clusterId, clusterMap[clusterId].cosineSimilarity(t))
             for clusterId in possibleNearestClusters),
            key=itemgetter(1))
        predicted.append(predictedClass[0])
        labels.append(t.clusterId)
    return EvaluationMetrics.purity(predicted, labels)