Пример #1
0
 def setUp(self):
     """Create the fixture shared by the tests.

     Builds an identity dimension map, ``signatureLength`` random unit
     vectors, two random documents (ids 1 and 2) signed with those vectors,
     and a ``SignaturePermutationWithTrie`` to which both documents are
     added.
     """
     self.dimension = 50
     self.signatureLength = 23

     # Forward-map every dimension index onto itself.
     self.phraseTextAndDimensionMap = TwoWayMap()
     for dimensionIndex in range(self.dimension):
         self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD,
                                            dimensionIndex, dimensionIndex)

     # Random vectors used to derive the document signatures.
     randomVectors = []
     for _ in range(self.signatureLength):
         randomVectors.append(
             VectorGenerator.getRandomGaussianUnitVector(
                 dimension=self.dimension, mu=0, sigma=1))
     self.unitRandomVectors = randomVectors

     # Documents are created in id order so the random draws happen in the
     # same sequence on every run of this fixture code.
     doc1Vector = VectorGenerator.getRandomGaussianUnitVector(
         dimension=self.dimension, mu=0, sigma=1)
     self.doc1 = Document(1, doc1Vector)
     doc2Vector = VectorGenerator.getRandomGaussianUnitVector(
         dimension=self.dimension, mu=0, sigma=1)
     self.doc2 = Document(2, doc2Vector)

     for document in (self.doc1, self.doc2):
         document.setSignatureUsingVectors(self.unitRandomVectors,
                                           self.phraseTextAndDimensionMap)

     self.pm = SignaturePermutationWithTrie(
         signatureLength=self.signatureLength)
     for document in (self.doc1, self.doc2):
         self.pm.addDocument(document)
Пример #2
0
 def test_addDocument_newKey(self):
     """A document added to a fresh permutation is stored under its key.

     The key is the document signature permuted by that permutation and
     rendered with ``to01()``; the stored value must be the set ``{1}``.
     """
     randomVector = VectorGenerator.getRandomGaussianUnitVector(
         dimension=self.dimension, mu=0, sigma=1)
     document = Document(1, randomVector)
     document.setSignatureUsingVectors(self.unitRandomVectors,
                                       self.phraseTextAndDimensionMap)

     permutation = SignaturePermutationWithTrie(
         signatureLength=self.signatureLength)
     permutation.addDocument(document)

     trie = permutation.signatureTrie
     trieKey = document.signature.permutate(permutation).to01()
     self.assertEqual(trie[trieKey], {1})
    def __init__(self, **settings):
        """Set up the LSH state from keyword settings.

        Required keys: ``nearest_neighbor_threshold``, ``dimensions``,
        ``signature_length`` and ``number_of_permutations``.
        Optional key: ``signature_type`` (default ``'signature_type_trie'``);
        any other value selects ``SignaturePermutationWithSortedList``.

        Raises:
            KeyError: if a required key is missing.
        """
        self.settings = settings
        self.nearestNeighborThreshold = settings['nearest_neighbor_threshold']

        dimensions = settings['dimensions']
        self.unitVector = RandomGaussianUnitVector(
            dimensions=dimensions, mu=0, sigma=1)

        signatureLength = settings['signature_length']
        self.vectorPermutations = VectorPermutation.getPermutations(
            signatureLength, dimensions, self.unitVector)

        signatureType = settings.get('signature_type', 'signature_type_trie')
        permutationCount = settings['number_of_permutations']
        # Only the selected class name is looked up.
        permutationClass = (SignaturePermutationWithTrie
                            if signatureType == 'signature_type_trie' else
                            SignaturePermutationWithSortedList)
        self.signaturePermutations = [
            permutationClass(signatureLength) for _ in range(permutationCount)
        ]

        self.phraseTextAndDimensionMap = TwoWayMap()
        self.documentIdToDocumentMap = {}
Пример #4
0
 def setUp(self):
     """Prepare two signed random documents indexed in ``self.pm``.

     The dimension map sends each index in ``range(self.dimension)`` to
     itself; documents 1 and 2 get signatures from the same list of random
     unit vectors and are then added to a ``SignaturePermutationWithTrie``.
     """
     self.dimension, self.signatureLength = 50, 23

     def randomUnitVector():
         # Single place for the generator arguments used throughout.
         return VectorGenerator.getRandomGaussianUnitVector(
             dimension=self.dimension, mu=0, sigma=1)

     self.phraseTextAndDimensionMap = TwoWayMap()
     for index in range(self.dimension):
         self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, index,
                                            index)

     self.unitRandomVectors = [
         randomUnitVector() for _ in range(self.signatureLength)
     ]

     self.doc1 = Document(1, randomUnitVector())
     self.doc2 = Document(2, randomUnitVector())
     self.doc1.setSignatureUsingVectors(self.unitRandomVectors,
                                        self.phraseTextAndDimensionMap)
     self.doc2.setSignatureUsingVectors(self.unitRandomVectors,
                                        self.phraseTextAndDimensionMap)

     self.pm = SignaturePermutationWithTrie(
         signatureLength=self.signatureLength)
     self.pm.addDocument(self.doc1)
     self.pm.addDocument(self.doc2)
Пример #5
0
 def __init__(self, **clustering_settings):
     """Initialise the clustering state from keyword settings.

     Required keys: ``threshold_for_document_to_be_in_cluster``,
     ``dimensions``, ``signature_length`` and ``number_of_permutations``.
     Optional key: ``signature_type`` (default ``'signature_type_trie'``);
     any other value selects ``SignaturePermutationWithSortedList``.

     Raises:
         KeyError: if a required key is missing.
     """
     self.thresholdForDocumentToBeInACluster = clustering_settings[
         'threshold_for_document_to_be_in_cluster']

     dimensions = clustering_settings['dimensions']
     self.unitVector = RandomGaussianUnitVector(
         dimensions=dimensions, mu=0, sigma=1)

     signatureLength = clustering_settings['signature_length']
     self.vectorPermutations = VectorPermutation.getPermutations(
         signatureLength, dimensions, self.unitVector)

     signatureType = clustering_settings.get('signature_type',
                                             'signature_type_trie')
     numberOfPermutations = clustering_settings['number_of_permutations']
     if signatureType == 'signature_type_trie':
         permutationFactory = SignaturePermutationWithTrie
     else:
         permutationFactory = SignaturePermutationWithSortedList
     self.signaturePermutations = [
         permutationFactory(signatureLength)
         for _ in range(numberOfPermutations)
     ]

     self.phraseTextAndDimensionMap = TwoWayMap()
     self.clusters = {}
     self.clustering_settings = clustering_settings
Пример #6
0
 def test_permutate(self):
     """Permuting a signature must leave the value of ``count()`` unchanged.

     Uses ``assertEqual`` rather than ``assertTrue(a == b)`` so that a
     failure reports both counts instead of just ``False is not true``.
     """
     signature = Signature('1001011')
     # Take the count before permuting, matching the original evaluation
     # order.
     countBeforePermutation = signature.count()
     permutedSignature = signature.permutate(SignaturePermutationWithTrie(7))
     self.assertEqual(countBeforePermutation, permutedSignature.count())
Пример #7
0
 def test_getNearestDocument_emptyTrie(self):
     """A permutation with no documents added returns an empty set."""
     emptyPermutation = SignaturePermutationWithTrie(
         signatureLength=self.signatureLength)
     nearestDocuments = emptyPermutation.getNearestDocuments(self.doc1)
     self.assertEqual(nearestDocuments, set())
Пример #8
0
class SignaturePermutationTests(unittest.TestCase):
    """Exercise add / lookup / remove / reset on SignaturePermutationWithTrie."""

    def setUp(self):
        """Index two randomly generated, signed documents in ``self.pm``."""
        self.dimension, self.signatureLength = 50, 23
        self.phraseTextAndDimensionMap = TwoWayMap()
        for i in range(self.dimension):
            self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, i, i)
        self.unitRandomVectors = [
            self._randomUnitVector() for _ in range(self.signatureLength)
        ]
        self.doc1 = self._randomDocument(1)
        self.doc2 = self._randomDocument(2)
        for document in (self.doc1, self.doc2):
            document.setSignatureUsingVectors(self.unitRandomVectors,
                                              self.phraseTextAndDimensionMap)
        self.pm = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        self.pm.addDocument(self.doc1)
        self.pm.addDocument(self.doc2)

    def _randomUnitVector(self):
        """Return a fresh random unit vector with ``self.dimension`` dimensions."""
        return VectorGenerator.getRandomGaussianUnitVector(
            dimension=self.dimension, mu=0, sigma=1)

    def _randomDocument(self, docId):
        """Return a ``Document`` with the given id and a random vector."""
        return Document(docId, self._randomUnitVector())

    def _documentSharingSignatureOfDoc1(self, docId):
        """Return a random document whose signature is a copy of doc1's."""
        document = self._randomDocument(docId)
        document.signature = Signature(self.doc1.signature.to01())
        return document

    def _trieKey(self, permutation, document):
        """Return the trie key of ``document`` under ``permutation``."""
        return document.signature.permutate(permutation).to01()

    def test_addDocument_newKey(self):
        """A document added to a fresh permutation is stored as ``{1}``."""
        doc1 = self._randomDocument(1)
        doc1.setSignatureUsingVectors(self.unitRandomVectors,
                                      self.phraseTextAndDimensionMap)
        pm = SignaturePermutationWithTrie(signatureLength=self.signatureLength)
        pm.addDocument(doc1)
        self.assertEqual(pm.signatureTrie[self._trieKey(pm, doc1)], {1})

    def test_addDocument_existingKey(self):
        """Two documents with the same signature share one trie entry."""
        duplicate = self._documentSharingSignatureOfDoc1(3)
        self.pm.addDocument(duplicate)
        self.assertEqual(
            self.pm.signatureTrie[self._trieKey(self.pm, self.doc1)], {1, 3})

    def test_getNearestDocument_usingAKeyAlreadyInTrie(self):
        """Looking up an indexed document returns its own id."""
        self.assertEqual(self.pm.getNearestDocuments(self.doc1), {1})

    def test_getNearestDocument_usingANearbyKeyInTrie(self):
        """A signature differing from doc1's only in the last bit finds doc1."""
        digitReplacement = {'0': '1', '1': '0'}
        nearbyDocument = self._randomDocument(3)
        exactSignature = self.doc1.signature.to01()
        nearbyDocument.signature = Signature(
            exactSignature[:-1] + digitReplacement[exactSignature[-1]])
        # assertNotEquals is a deprecated alias (removed in Python 3.12).
        self.assertNotEqual(self.doc1.signature.to01(),
                            nearbyDocument.signature.to01())
        # This assertion can sometimes fail because of randomization.
        # Run the tests again. It's OK!
        self.assertEqual(self.pm.getNearestDocuments(nearbyDocument), {1})

    def test_getNearestDocument_emptyTrie(self):
        """A permutation with no documents added returns an empty set."""
        permutationWithEmptyTrie = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        self.assertEqual(
            permutationWithEmptyTrie.getNearestDocuments(self.doc1), set())

    def test_removeDocument_documents(self):
        """Removing documents shrinks the entry, then drops the key."""
        duplicate = self._documentSharingSignatureOfDoc1(3)
        self.pm.addDocument(duplicate)
        self.assertEqual(
            self.pm.signatureTrie[self._trieKey(self.pm, self.doc1)], {1, 3})
        self.pm.removeDocument(duplicate)
        self.assertEqual(
            self.pm.signatureTrie[self._trieKey(self.pm, self.doc1)], {1})
        self.pm.removeDocument(self.doc1)
        # After the last document is removed the key must be gone.
        self.assertIsNone(
            self.pm.signatureTrie.get(self._trieKey(self.pm, self.doc1)))

    def test_resetSignatureTrie(self):
        """resetSignatureDataStructure() leaves the trie empty."""
        self.assertGreater(len(self.pm.signatureTrie), 0)
        self.pm.resetSignatureDataStructure()
        self.assertEqual(len(self.pm.signatureTrie), 0)
Пример #9
0
class SignaturePermutationTests(unittest.TestCase):
    """Tests for adding, finding, removing and resetting trie permutations."""

    def setUp(self):
        """Build two signed random documents and add them to ``self.pm``."""
        self.dimension, self.signatureLength = 50, 23
        self.phraseTextAndDimensionMap = TwoWayMap()
        for i in range(self.dimension):
            self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, i, i)
        self.unitRandomVectors = [
            VectorGenerator.getRandomGaussianUnitVector(
                dimension=self.dimension, mu=0, sigma=1)
            for _ in range(self.signatureLength)
        ]
        self.doc1 = Document(
            1,
            VectorGenerator.getRandomGaussianUnitVector(
                dimension=self.dimension, mu=0, sigma=1))
        self.doc2 = Document(
            2,
            VectorGenerator.getRandomGaussianUnitVector(
                dimension=self.dimension, mu=0, sigma=1))
        self.doc1.setSignatureUsingVectors(self.unitRandomVectors,
                                           self.phraseTextAndDimensionMap)
        self.doc2.setSignatureUsingVectors(self.unitRandomVectors,
                                           self.phraseTextAndDimensionMap)
        self.pm = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        self.pm.addDocument(self.doc1)
        self.pm.addDocument(self.doc2)

    def test_addDocument_newKey(self):
        """A document added to a fresh permutation is stored as ``{1}``."""
        doc1 = Document(
            1,
            VectorGenerator.getRandomGaussianUnitVector(
                dimension=self.dimension, mu=0, sigma=1))
        doc1.setSignatureUsingVectors(self.unitRandomVectors,
                                      self.phraseTextAndDimensionMap)
        pm = SignaturePermutationWithTrie(signatureLength=self.signatureLength)
        pm.addDocument(doc1)
        self.assertEqual(
            pm.signatureTrie[doc1.signature.permutate(pm).to01()], {1})

    def test_addDocument_existingKey(self):
        """A document reusing doc1's signature joins doc1's trie entry."""
        newDocModifiedWithExistingSignature = Document(
            3,
            VectorGenerator.getRandomGaussianUnitVector(
                dimension=self.dimension, mu=0, sigma=1))
        newDocModifiedWithExistingSignature.signature = Signature(
            self.doc1.signature.to01())
        self.pm.addDocument(newDocModifiedWithExistingSignature)
        doc1Key = self.doc1.signature.permutate(self.pm).to01()
        self.assertEqual(self.pm.signatureTrie[doc1Key], {1, 3})

    def test_getNearestDocument_usingAKeyAlreadyInTrie(self):
        """Looking up an indexed document returns its own id."""
        self.assertEqual(self.pm.getNearestDocuments(self.doc1), {1})

    def test_getNearestDocument_usingANearbyKeyInTrie(self):
        """A signature with only the last bit flipped still finds doc1."""
        digitReplacement = {'0': '1', '1': '0'}
        newDocWithANearbySignature = Document(
            3,
            VectorGenerator.getRandomGaussianUnitVector(
                dimension=self.dimension, mu=0, sigma=1))
        exactSignature = self.doc1.signature.to01()
        newDocWithANearbySignature.signature = Signature(
            exactSignature[:-1] + digitReplacement[exactSignature[-1]])
        # assertNotEqual replaces the deprecated assertNotEquals alias,
        # which no longer exists in Python 3.12+.
        self.assertNotEqual(self.doc1.signature.to01(),
                            newDocWithANearbySignature.signature.to01())
        # This assertion can sometimes fail because of randomization.
        # Run the tests again. It's OK!
        self.assertEqual(
            self.pm.getNearestDocuments(newDocWithANearbySignature), {1})

    def test_getNearestDocument_emptyTrie(self):
        """A permutation with no documents added returns an empty set."""
        permutationWithEmptyTrie = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        self.assertEqual(
            permutationWithEmptyTrie.getNearestDocuments(self.doc1), set())

    def test_removeDocument_documents(self):
        """Removal shrinks a shared entry and finally drops the key."""
        newDocModifiedWithExistingSignature = Document(
            3,
            VectorGenerator.getRandomGaussianUnitVector(
                dimension=self.dimension, mu=0, sigma=1))
        newDocModifiedWithExistingSignature.signature = Signature(
            self.doc1.signature.to01())
        self.pm.addDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(
            self.pm.signatureTrie[self.doc1.signature.permutate(
                self.pm).to01()], {1, 3})
        self.pm.removeDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(
            self.pm.signatureTrie[self.doc1.signature.permutate(
                self.pm).to01()], {1})
        self.pm.removeDocument(self.doc1)
        # With no documents left under the key, get() must return None.
        self.assertIsNone(
            self.pm.signatureTrie.get(
                self.doc1.signature.permutate(self.pm).to01()))

    def test_resetSignatureTrie(self):
        """resetSignatureDataStructure() leaves the trie empty."""
        self.assertGreater(len(self.pm.signatureTrie), 0)
        self.pm.resetSignatureDataStructure()
        self.assertEqual(len(self.pm.signatureTrie), 0)
Пример #10
0
 def test_getNearestDocument_emptyTrie(self):
     """Querying a permutation that holds no documents yields ``set()``."""
     freshPermutation = SignaturePermutationWithTrie(
         signatureLength=self.signatureLength)
     found = freshPermutation.getNearestDocuments(self.doc1)
     self.assertEqual(found, set())
Пример #11
0
 def test_addDocument_newKey(self):
     """The first document added is stored as ``{1}`` under its permuted key."""
     vector = VectorGenerator.getRandomGaussianUnitVector(
         dimension=self.dimension, mu=0, sigma=1)
     addedDocument = Document(1, vector)
     addedDocument.setSignatureUsingVectors(self.unitRandomVectors,
                                            self.phraseTextAndDimensionMap)

     trieIndex = SignaturePermutationWithTrie(
         signatureLength=self.signatureLength)
     trieIndex.addDocument(addedDocument)

     storedEntries = trieIndex.signatureTrie
     permutedKey = addedDocument.signature.permutate(trieIndex).to01()
     self.assertEqual(storedEntries[permutedKey], {1})
Пример #12
0
def offlineLSHClusteringDemo(dimensions=53,
                             signatureLength=13,
                             numberOfPermutations=5,
                             trainingFile='../data/train_offline.dat',
                             testFile='../data/test_offline.dat'):
    """Train and evaluate an offline LSH nearest-cluster model.

    Each line of the input files is split on whitespace: the first token is
    the cluster id and every remaining token is a word. Training documents
    are grouped by cluster id, each group is reduced to one mean-vector
    document, and those cluster documents are added to every signature
    permutation. Each test document is then assigned the candidate cluster
    with the highest cosine similarity.

    Args:
        dimensions: value passed to ``RandomGaussianUnitVector`` and
            ``VectorPermutation.getPermutations``.
        signatureLength: length passed to the permutation constructors.
        numberOfPermutations: how many signature permutations to build.
        trainingFile: path of the labelled training file.
        testFile: path of the labelled test file.

    Returns:
        The value of ``EvaluationMetrics.purity(predicted, labels)``.

    Raises:
        IndexError: if an input line contains no tokens.
        ValueError: if there are no clusters at all to compare against.
    """
    wordToDimensionMap = {}

    def createDocumentFromLine(docId, line):
        """Build a word-count ``Document`` from one labelled line."""
        words = line.split()
        vector = Vector()
        for word in words[1:]:
            # New words get the next free dimension index.
            wordDimension = wordToDimensionMap.setdefault(
                word, len(wordToDimensionMap))
            if wordDimension not in vector:
                vector[wordDimension] = 1
            else:
                vector[wordDimension] += 1
        return Document(docId, vector, clusterId=words[0])

    def readDocuments(fileName):
        """Return ``{line index: Document}`` for every line of ``fileName``."""
        documents = {}
        for docId, line in enumerate(FileIO.iterateLinesFromFile(fileName)):
            documents[docId] = createDocumentFromLine(docId, line)
        return documents

    unitVector = RandomGaussianUnitVector(dimensions=dimensions, mu=0, sigma=1)
    vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, unitVector)
    signaturePermutations = [
        SignaturePermutationWithTrie(signatureLength)
        for _ in range(numberOfPermutations)
    ]
    permutatedUnitVectors = [
        unitVector.getPermutedVector(r) for r in vectorPermutations
    ]

    # Build LSH model.
    # Read training documents and group them by cluster id.
    trainingDocumentsMap = readDocuments(trainingFile)
    clusterToDocumentsMap = defaultdict(list)
    for document in trainingDocumentsMap.values():
        clusterToDocumentsMap[document.clusterId].append(document)

    # Represent every cluster by the mean vector of its members.
    # items() instead of the Python 2-only iteritems().
    clusterMap = {}
    for clusterId, members in clusterToDocumentsMap.items():
        clusterMap[clusterId] = Document(docId=clusterId,
                                         vector=Vector.getMeanVector(members),
                                         clusterId=clusterId)

    # Create signatures for all clusters and add them to every permutation.
    # Explicit loops replace map(lambda ...): map() is lazy on Python 3, so
    # the signatures would never have been set there.
    # NOTE(review): setSignatureUsingVectors is called with one argument
    # here, as in the original; confirm that signature against the
    # Document class in use.
    for clusterDocument in clusterMap.values():
        clusterDocument.setSignatureUsingVectors(permutatedUnitVectors)
    for permutation in signaturePermutations:
        for clusterDocument in clusterMap.values():
            permutation.addDocument(clusterDocument)

    # Testing the model.
    testDocumentsMap = readDocuments(testFile)
    for testDocument in testDocumentsMap.values():
        testDocument.setSignatureUsingVectors(permutatedUnitVectors)

    predicted, labels = [], []
    for testDocument in testDocumentsMap.values():
        # Union of the candidates proposed by every permutation.
        possibleNearestClusters = set()
        for permutation in signaturePermutations:
            possibleNearestClusters.update(
                permutation.getNearestDocuments(testDocument))
        if not possibleNearestClusters:
            # No permutation produced a candidate; compare against every
            # cluster rather than letting max() fail on an empty sequence.
            possibleNearestClusters = set(clusterMap)
        predictedClass = max(
            ((clusterId, clusterMap[clusterId].cosineSimilarity(testDocument))
             for clusterId in possibleNearestClusters),
            key=itemgetter(1))
        predicted.append(predictedClass[0])
        labels.append(testDocument.clusterId)
    return EvaluationMetrics.purity(predicted, labels)