Пример #1
0
 def setUp(self):
     """Create the phrase fixtures and lower ``stream_settings['dimensions']`` to 2.

     The value that was configured before is kept in
     ``self.initial_max_dimensions``; this method does not restore it itself.
     """
     self.phraseVector = {
         'project': 1, 'cluster': 1, 'highdimensional': 1, 'streams': 1}

     # Only 'project' and 'cluster' start out with a dimension (0 and 1).
     dimensionMap = TwoWayMap()
     for dimensionId, phraseText in enumerate(('project', 'cluster')):
         dimensionMap.set(TwoWayMap.MAP_FORWARD, phraseText, dimensionId)
     self.phraseTextAndDimensionMap = dimensionMap

     # 'abcd' carries a timestamp three inactivity periods before test_time.
     inactivityPeriod = stream_settings['max_phrase_inactivity_time_in_seconds']
     phraseTimestamps = (
         ('project', test_time),
         ('cluster', test_time),
         ('abcd', test_time - 3 * inactivityPeriod),
     )
     phraseObjects = {}
     for phraseText, timestamp in phraseTimestamps:
         phraseObjects[phraseText] = Phrase(phraseText, timestamp, score=8)
     self.phraseTextToPhraseObjectMap = phraseObjects

     self.vector = Vector(dict.fromkeys(range(4), 1))

     self.initial_max_dimensions = stream_settings['dimensions']
     stream_settings['dimensions'] = 2
Пример #2
0
 def setUp(self):
     """Index two randomly generated documents (ids 1 and 2) in ``self.pm``.

     Both documents get their signature from the same list of
     ``signatureLength`` random unit vectors, using a map in which every
     dimension id is keyed by itself.
     """
     self.dimension = 50
     self.signatureLength = 23

     def randomUnitVector():
         # One fresh random vector over self.dimension dimensions.
         return VectorGenerator.getRandomGaussianUnitVector(
             dimension=self.dimension, mu=0, sigma=1)

     self.phraseTextAndDimensionMap = TwoWayMap()
     for dimensionId in range(self.dimension):
         self.phraseTextAndDimensionMap.set(
             TwoWayMap.MAP_FORWARD, dimensionId, dimensionId)

     self.unitRandomVectors = []
     for _ in range(self.signatureLength):
         self.unitRandomVectors.append(randomUnitVector())

     self.doc1 = Document(1, randomUnitVector())
     self.doc2 = Document(2, randomUnitVector())
     for document in (self.doc1, self.doc2):
         document.setSignatureUsingVectors(
             self.unitRandomVectors, self.phraseTextAndDimensionMap)

     self.pm = SignaturePermutationWithTrie(
         signatureLength=self.signatureLength)
     for document in (self.doc1, self.doc2):
         self.pm.addDocument(document)
Пример #3
0
 def test_setSignatureUsingVectors(self):
     # Two documents are signed against the same two vectors. One uses only
     # phrases present in the map ('a', 'b'); the other uses 'c', which the
     # map does not contain. Expected signatures: '01' and '10'.
     dimensionMap = TwoWayMap()
     for phraseText, dimensionId in (('a', 1), ('b', 2)):
         dimensionMap.set(TwoWayMap.MAP_FORWARD, phraseText, dimensionId)

     mappedDocument = Document(1, {'a': 1, 'b': 4})
     partlyUnmappedDocument = Document(1, {'a': 1, 'c': 4})
     projectionVectors = [
         Vector({1: 3 / 5., 2: -4 / 5.}),
         Vector({1: -5 / 13., 2: 12 / 13.}),
     ]

     for document in (mappedDocument, partlyUnmappedDocument):
         document.setSignatureUsingVectors(projectionVectors, dimensionMap)

     expectations = (('01', mappedDocument), ('10', partlyUnmappedDocument))
     for expectedBits, document in expectations:
         self.assertEqual(Signature(expectedBits), document.signature)
Пример #4
0
 def setUp(self):
     """Create the phrase fixtures and lower ``settings['dimensions']`` to 2.

     ``settings`` is shared state, so a cleanup is registered that puts the
     'dimensions' entry back the way it was (or removes it again if it was
     not set) once the test has finished. Without it the override, and any
     further change a test makes, would leak into later tests.
     """
     self.phraseVector = {'project': 1, 'cluster': 1, 'highdimensional': 1, 'streams': 1}
     self.phraseTextAndDimensionMap = TwoWayMap()
     self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'project', 0)
     self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'cluster', 1)
     self.finalPhraseToIdMap = {'project': 0, 'cluster': 1, 'streams': 2, 'highdimensional': 3}

     # A private sentinel tells "key absent" apart from "key set to None".
     notConfigured = object()
     previousDimensions = settings.get('dimensions', notConfigured)

     def restoreDimensions():
         if previousDimensions is notConfigured:
             settings.pop('dimensions', None)
         else:
             settings['dimensions'] = previousDimensions

     self.addCleanup(restoreDimensions)
     settings['dimensions'] = 2
Пример #5
0
class UtilityMethodsTests(unittest.TestCase):
    """Tests for UtilityMethods.updatePhraseTextAndDimensionsMap.

    The tests override the shared ``settings['dimensions']``; tearDown puts
    the entry back the way it was so the override cannot leak into other
    tests.
    """

    def setUp(self):
        """Map two of the four phrases and lower ``settings['dimensions']`` to 2."""
        self.phraseVector = {'project': 1, 'cluster': 1, 'highdimensional': 1, 'streams': 1}
        self.phraseTextAndDimensionMap = TwoWayMap()
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'project', 0)
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'cluster', 1)
        # Forward map expected once all four phrases have a dimension.
        self.finalPhraseToIdMap = {'project': 0, 'cluster': 1, 'streams': 2, 'highdimensional': 3}
        # Remember the shared entry (or its absence) before overriding it.
        self._dimensionsWasConfigured = 'dimensions' in settings
        self._initialDimensions = settings.get('dimensions')
        settings['dimensions'] = 2

    def tearDown(self):
        """Restore ``settings['dimensions']`` to its state before setUp ran."""
        if self._dimensionsWasConfigured:
            settings['dimensions'] = self._initialDimensions
        else:
            settings.pop('dimensions', None)

    def test_updatePhraseTextAndDimensionsMap_PhraseMapHasLesserDimensions(self):
        # With four dimensions allowed, all four phrases are expected in the map.
        settings['dimensions'] = 4
        UtilityMethods.updatePhraseTextAndDimensionsMap(
            self.phraseVector, self.phraseTextAndDimensionMap, **settings)
        self.assertEqual(
            self.finalPhraseToIdMap,
            self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))

    def test_updatePhraseTextAndDimensionsMap_PhraseMapHasMaximumDimensions(self):
        # With the limit of 2 already reached, the map is expected to stay as is.
        UtilityMethods.updatePhraseTextAndDimensionsMap(
            self.phraseVector, self.phraseTextAndDimensionMap, **settings)
        for k in ['streams', 'highdimensional']:
            del self.finalPhraseToIdMap[k]
        self.assertEqual(
            self.finalPhraseToIdMap,
            self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))
    def __init__(self, **settings):
        """Build the random unit vector, its permutations and the signature indexes.

        Keys read from ``settings``: 'nearest_neighbor_threshold',
        'dimensions', 'signature_length', 'number_of_permutations' and the
        optional 'signature_type'. A missing 'signature_type' or the value
        'signature_type_trie' selects SignaturePermutationWithTrie; any other
        value selects SignaturePermutationWithSortedList.
        """
        self.settings = settings
        self.nearestNeighborThreshold = settings['nearest_neighbor_threshold']

        dimensions = settings['dimensions']
        self.unitVector = RandomGaussianUnitVector(
            dimensions=dimensions, mu=0, sigma=1)
        signatureLength = settings['signature_length']
        self.vectorPermutations = VectorPermutation.getPermutations(
            signatureLength, dimensions, self.unitVector)

        # Pick the index implementation once, then create one per permutation.
        usesTrie = settings.get(
            'signature_type', 'signature_type_trie') == 'signature_type_trie'
        permutationClass = (SignaturePermutationWithTrie if usesTrie
                            else SignaturePermutationWithSortedList)
        permutationCount = settings['number_of_permutations']
        self.signaturePermutations = [
            permutationClass(signatureLength) for _ in range(permutationCount)
        ]

        self.phraseTextAndDimensionMap = TwoWayMap()
        self.documentIdToDocumentMap = {}
Пример #7
0
class SignaturePermutationTests(unittest.TestCase):
    """Tests for adding, finding, removing and resetting documents in
    SignaturePermutationWithTrie.

    The fixture is built from random vectors, so the actual signatures
    differ from run to run.
    """

    def setUp(self):
        """Index two random documents (ids 1 and 2) in ``self.pm``."""
        self.dimension, self.signatureLength = 50, 23
        # Every dimension id is keyed by itself.
        self.phraseTextAndDimensionMap = TwoWayMap()
        for i in range(self.dimension):
            self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, i, i)
        self.unitRandomVectors = [
            self._randomUnitVector() for _ in range(self.signatureLength)
        ]
        self.doc1 = self._randomDocument(1)
        self.doc2 = self._randomDocument(2)
        self.doc1.setSignatureUsingVectors(
            self.unitRandomVectors, self.phraseTextAndDimensionMap)
        self.doc2.setSignatureUsingVectors(
            self.unitRandomVectors, self.phraseTextAndDimensionMap)
        self.pm = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        self.pm.addDocument(self.doc1)
        self.pm.addDocument(self.doc2)

    def _randomUnitVector(self):
        """Return a fresh random unit vector over ``self.dimension`` dimensions."""
        return VectorGenerator.getRandomGaussianUnitVector(
            dimension=self.dimension, mu=0, sigma=1)

    def _randomDocument(self, documentId):
        """Return a Document with the given id and a fresh random vector."""
        return Document(documentId, self._randomUnitVector())

    def _trieKey(self, document, permutation):
        """Return the 0/1 string under which ``permutation`` stores ``document``."""
        return document.signature.permutate(permutation).to01()

    def test_addDocument_newKey(self):
        doc1 = self._randomDocument(1)
        doc1.setSignatureUsingVectors(
            self.unitRandomVectors, self.phraseTextAndDimensionMap)
        pm = SignaturePermutationWithTrie(signatureLength=self.signatureLength)
        pm.addDocument(doc1)
        self.assertEqual(pm.signatureTrie[self._trieKey(doc1, pm)], set([1]))

    def test_addDocument_existingKey(self):
        # A third document reusing doc1's signature must share doc1's entry.
        newDocModifiedWithExistingSignature = self._randomDocument(3)
        newDocModifiedWithExistingSignature.signature = Signature(
            self.doc1.signature.to01())
        self.pm.addDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(
            self.pm.signatureTrie[self._trieKey(self.doc1, self.pm)],
            set([1, 3]))

    def test_getNearestDocument_usingAKeyAlreadyInTrie(self):
        self.assertEqual(self.pm.getNearestDocuments(self.doc1), set([1]))

    def test_getNearestDocument_usingANearbyKeyInTrie(self):
        # Flip only the last bit of doc1's signature to get a "nearby" one.
        digitReplacement = {'0': '1', '1': '0'}
        newDocWithANearbySignature = self._randomDocument(3)
        exactSignature = self.doc1.signature.to01()
        newDocWithANearbySignature.signature = Signature(
            exactSignature[:-1] + digitReplacement[exactSignature[-1]])
        # assertNotEqual replaces the deprecated assertNotEquals alias,
        # which no longer exists on Python 3.12+.
        self.assertNotEqual(
            self.doc1.signature.to01(),
            newDocWithANearbySignature.signature.to01())
        # NOTE: this assertion can occasionally fail because the fixture is
        # random. Re-running the tests is the accepted workaround.
        self.assertEqual(
            self.pm.getNearestDocuments(newDocWithANearbySignature),
            set([1]))

    def test_getNearestDocument_emptyTrie(self):
        permutationWithEmptyTrie = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        self.assertEqual(
            permutationWithEmptyTrie.getNearestDocuments(self.doc1), set())

    def test_removeDocument_documents(self):
        newDocModifiedWithExistingSignature = self._randomDocument(3)
        newDocModifiedWithExistingSignature.signature = Signature(
            self.doc1.signature.to01())
        self.pm.addDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(
            self.pm.signatureTrie[self._trieKey(self.doc1, self.pm)],
            set([1, 3]))
        self.pm.removeDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(
            self.pm.signatureTrie[self._trieKey(self.doc1, self.pm)],
            set([1]))
        # Removing the last document under a key must drop the key itself.
        self.pm.removeDocument(self.doc1)
        self.assertEqual(
            None,
            self.pm.signatureTrie.get(self._trieKey(self.doc1, self.pm)))

    def test_resetSignatureTrie(self):
        self.assertTrue(len(self.pm.signatureTrie) > 0)
        self.pm.resetSignatureDataStructure()
        self.assertTrue(len(self.pm.signatureTrie) == 0)
Пример #8
0
 def setUp(self):
     """Create the phrase fixtures and lower ``settings['dimensions']`` to 2.

     ``settings`` is shared state, so a cleanup is registered that puts the
     'dimensions' entry back the way it was (or removes it again if it was
     not set) once the test has finished. Without it the override, and any
     further change a test makes, would leak into later tests.
     """
     self.phraseVector = {
         'project': 1,
         'cluster': 1,
         'highdimensional': 1,
         'streams': 1
     }
     self.phraseTextAndDimensionMap = TwoWayMap()
     self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'project', 0)
     self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'cluster', 1)
     self.finalPhraseToIdMap = {
         'project': 0,
         'cluster': 1,
         'streams': 2,
         'highdimensional': 3
     }

     # A private sentinel tells "key absent" apart from "key set to None".
     notConfigured = object()
     previousDimensions = settings.get('dimensions', notConfigured)

     def restoreDimensions():
         if previousDimensions is notConfigured:
             settings.pop('dimensions', None)
         else:
             settings['dimensions'] = previousDimensions

     self.addCleanup(restoreDimensions)
     settings['dimensions'] = 2
 def setUp(self):
     """Create the phrase fixtures and lower ``stream_settings['dimensions']`` to 2.

     The value that was configured before is kept in
     ``self.initial_max_dimensions``; this method does not restore it itself.
     """
     self.phraseVector = {
         'project': 1,
         'cluster': 1,
         'highdimensional': 1,
         'streams': 1,
     }

     # Only 'project' and 'cluster' start out with a dimension (0 and 1).
     dimensionMap = TwoWayMap()
     for dimensionId, phraseText in enumerate(('project', 'cluster')):
         dimensionMap.set(TwoWayMap.MAP_FORWARD, phraseText, dimensionId)
     self.phraseTextAndDimensionMap = dimensionMap

     # 'abcd' carries a timestamp three inactivity periods before test_time.
     inactivityPeriod = stream_settings['max_phrase_inactivity_time_in_seconds']
     phraseTimestamps = (
         ('project', test_time),
         ('cluster', test_time),
         ('abcd', test_time - 3 * inactivityPeriod),
     )
     phraseObjects = {}
     for phraseText, timestamp in phraseTimestamps:
         phraseObjects[phraseText] = Phrase(phraseText, timestamp, score=8)
     self.phraseTextToPhraseObjectMap = phraseObjects

     self.vector = Vector(dict.fromkeys(range(4), 1))

     self.initial_max_dimensions = stream_settings['dimensions']
     stream_settings['dimensions'] = 2
Пример #10
0
 def setUp(self):
     """Index two randomly generated documents (ids 1 and 2) in ``self.pm``.

     Both documents get their signature from the same list of
     ``signatureLength`` random unit vectors, using a map in which every
     dimension id is keyed by itself.
     """
     self.dimension = 50
     self.signatureLength = 23

     def randomUnitVector():
         # One fresh random vector over self.dimension dimensions.
         return VectorGenerator.getRandomGaussianUnitVector(
             dimension=self.dimension, mu=0, sigma=1)

     self.phraseTextAndDimensionMap = TwoWayMap()
     for dimensionId in range(self.dimension):
         self.phraseTextAndDimensionMap.set(
             TwoWayMap.MAP_FORWARD, dimensionId, dimensionId)

     self.unitRandomVectors = []
     for _ in range(self.signatureLength):
         self.unitRandomVectors.append(randomUnitVector())

     self.doc1 = Document(1, randomUnitVector())
     self.doc2 = Document(2, randomUnitVector())
     for document in (self.doc1, self.doc2):
         document.setSignatureUsingVectors(
             self.unitRandomVectors, self.phraseTextAndDimensionMap)

     self.pm = SignaturePermutationWithTrie(
         signatureLength=self.signatureLength)
     for document in (self.doc1, self.doc2):
         self.pm.addDocument(document)
Пример #11
0
class UtilityMethodsTests(unittest.TestCase):
    """Tests for UtilityMethods.updatePhraseTextAndDimensionsMap.

    The tests override the shared ``settings['dimensions']``; tearDown puts
    the entry back the way it was so the override cannot leak into other
    tests.
    """

    def setUp(self):
        """Map two of the four phrases and lower ``settings['dimensions']`` to 2."""
        self.phraseVector = {
            'project': 1,
            'cluster': 1,
            'highdimensional': 1,
            'streams': 1
        }
        self.phraseTextAndDimensionMap = TwoWayMap()
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'project', 0)
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'cluster', 1)
        # Forward map expected once all four phrases have a dimension.
        self.finalPhraseToIdMap = {
            'project': 0,
            'cluster': 1,
            'streams': 2,
            'highdimensional': 3
        }
        # Remember the shared entry (or its absence) before overriding it.
        self._dimensionsWasConfigured = 'dimensions' in settings
        self._initialDimensions = settings.get('dimensions')
        settings['dimensions'] = 2

    def tearDown(self):
        """Restore ``settings['dimensions']`` to its state before setUp ran."""
        if self._dimensionsWasConfigured:
            settings['dimensions'] = self._initialDimensions
        else:
            settings.pop('dimensions', None)

    def test_updatePhraseTextAndDimensionsMap_PhraseMapHasLesserDimensions(
            self):
        # With four dimensions allowed, all four phrases are expected in the map.
        settings['dimensions'] = 4
        UtilityMethods.updatePhraseTextAndDimensionsMap(
            self.phraseVector, self.phraseTextAndDimensionMap, **settings)
        self.assertEqual(
            self.finalPhraseToIdMap,
            self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))

    def test_updatePhraseTextAndDimensionsMap_PhraseMapHasMaximumDimensions(
            self):
        # With the limit of 2 already reached, the map is expected to stay as is.
        UtilityMethods.updatePhraseTextAndDimensionsMap(
            self.phraseVector, self.phraseTextAndDimensionMap, **settings)
        for k in ['streams', 'highdimensional']:
            del self.finalPhraseToIdMap[k]
        self.assertEqual(
            self.finalPhraseToIdMap,
            self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))
Пример #12
0
 def test_setSignatureUsingVectors(self):
     # Two documents are signed against the same two vectors. One uses only
     # phrases present in the map ('a', 'b'); the other uses 'c', which the
     # map does not contain. Expected signatures: '01' and '10'.
     dimensionMap = TwoWayMap()
     for phraseText, dimensionId in (('a', 1), ('b', 2)):
         dimensionMap.set(TwoWayMap.MAP_FORWARD, phraseText, dimensionId)

     mappedDocument = Document(1, {'a': 1, 'b': 4})
     partlyUnmappedDocument = Document(1, {'a': 1, 'c': 4})
     firstVector = Vector({1: 3 / 5., 2: -4 / 5.})
     secondVector = Vector({1: -5 / 13., 2: 12 / 13.})
     projectionVectors = [firstVector, secondVector]

     for document in (mappedDocument, partlyUnmappedDocument):
         document.setSignatureUsingVectors(projectionVectors, dimensionMap)

     expectations = (('01', mappedDocument), ('10', partlyUnmappedDocument))
     for expectedBits, document in expectations:
         self.assertEqual(Signature(expectedBits), document.signature)
Пример #13
0
    def test_setSignatureUsingVectorPermutations(self):
        # Signing a document with explicitly permuted unit vectors must give
        # the same signature as signing it with the unit vector plus its
        # permutations. Checked with a map covering all 53 dimensions and
        # again with a map covering only the first 3.
        dimensions = 53
        signatureLength = 13

        def identityMap(size):
            # Map in which every dimension id below ``size`` is keyed by itself.
            mapping = TwoWayMap()
            for dimensionId in range(size):
                mapping.set(TwoWayMap.MAP_FORWARD, dimensionId, dimensionId)
            return mapping

        completeMap = identityMap(dimensions)
        truncatedMap = identityMap(dimensions - 50)

        unitVector = RandomGaussianUnitVector(
            dimensions=dimensions, mu=0, sigma=1)
        vectorPermutations = VectorPermutation.getPermutations(
            signatureLength, dimensions, unitVector)
        permutedUnitVectors = []
        for permutation in vectorPermutations:
            permutedUnitVectors.append(
                unitVector.getPermutedVector(permutation))

        documentVector = VectorGenerator.getRandomGaussianUnitVector(
            dimension=dimensions, mu=0, sigma=1)
        signedByVectors = Document(1, documentVector)
        signedByPermutations = Document(2, documentVector)

        for dimensionMap in (completeMap, truncatedMap):
            signedByVectors.setSignatureUsingVectors(
                permutedUnitVectors, dimensionMap)
            signedByPermutations.setSignatureUsingVectorPermutations(
                unitVector, vectorPermutations, dimensionMap)
            self.assertEqual(signedByVectors.signature,
                             signedByPermutations.signature)
Пример #14
0
 def __init__(self, **clustering_settings):
     """Build the random unit vector, its permutations and the signature indexes.

     Keys read from ``clustering_settings``:
     'threshold_for_document_to_be_in_cluster', 'dimensions',
     'signature_length', 'number_of_permutations' and the optional
     'signature_type'. A missing 'signature_type' or the value
     'signature_type_trie' selects SignaturePermutationWithTrie; any other
     value selects SignaturePermutationWithSortedList.
     """
     self.thresholdForDocumentToBeInACluster = clustering_settings[
         'threshold_for_document_to_be_in_cluster']

     dimensions = clustering_settings['dimensions']
     self.unitVector = RandomGaussianUnitVector(
         dimensions=dimensions, mu=0, sigma=1)
     signatureLength = clustering_settings['signature_length']
     self.vectorPermutations = VectorPermutation.getPermutations(
         signatureLength, dimensions, self.unitVector)

     # Pick the index implementation once, then create one per permutation.
     requestedType = clustering_settings.get('signature_type',
                                             'signature_type_trie')
     if requestedType == 'signature_type_trie':
         permutationClass = SignaturePermutationWithTrie
     else:
         permutationClass = SignaturePermutationWithSortedList
     permutationCount = clustering_settings['number_of_permutations']
     self.signaturePermutations = [
         permutationClass(signatureLength) for _ in range(permutationCount)
     ]

     self.phraseTextAndDimensionMap = TwoWayMap()
     self.clusters = {}
     self.clustering_settings = clustering_settings
Пример #15
0
 def test_setSignatureUsingVectorPermutations(self):
     # Signing a document with explicitly permuted unit vectors must give
     # the same signature as signing it with the unit vector plus its
     # permutations. Checked with a map covering all 53 dimensions and
     # again with a map covering only the first 3.
     dimensions = 53
     signatureLength = 13

     def identityMap(size):
         # Map in which every dimension id below ``size`` is keyed by itself.
         mapping = TwoWayMap()
         for dimensionId in range(size):
             mapping.set(TwoWayMap.MAP_FORWARD, dimensionId, dimensionId)
         return mapping

     completeMap = identityMap(dimensions)
     truncatedMap = identityMap(dimensions - 50)

     unitVector = RandomGaussianUnitVector(
         dimensions=dimensions, mu=0, sigma=1)
     vectorPermutations = VectorPermutation.getPermutations(
         signatureLength, dimensions, unitVector)
     permutedUnitVectors = []
     for permutation in vectorPermutations:
         permutedUnitVectors.append(
             unitVector.getPermutedVector(permutation))

     documentVector = VectorGenerator.getRandomGaussianUnitVector(
         dimension=dimensions, mu=0, sigma=1)
     signedByVectors = Document(1, documentVector)
     signedByPermutations = Document(2, documentVector)

     for dimensionMap in (completeMap, truncatedMap):
         signedByVectors.setSignatureUsingVectors(
             permutedUnitVectors, dimensionMap)
         signedByPermutations.setSignatureUsingVectorPermutations(
             unitVector, vectorPermutations, dimensionMap)
         self.assertEqual(signedByVectors.signature,
                          signedByPermutations.signature)
class UtilityMethodsTests(unittest.TestCase):
    """Tests for the UtilityMethods helpers that maintain phrase objects and
    the phrase-to-dimension map.

    Several tests change the shared ``stream_settings['dimensions']``;
    setUp saves the configured value and tearDown restores it.
    """

    def setUp(self):
        """Create the phrase fixtures and lower the dimension limit to 2."""
        self.phraseVector = {'project': 1, 'cluster': 1, 'highdimensional': 1, 'streams': 1}
        self.phraseTextAndDimensionMap = TwoWayMap()
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'project', 0)
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'cluster', 1)
        # 'abcd' carries a timestamp three inactivity periods before test_time.
        self.phraseTextToPhraseObjectMap = {
            'project': Phrase('project', test_time, score=8),
            'cluster': Phrase('cluster', test_time, score=8),
            'abcd': Phrase(
                'abcd',
                test_time - 3 * stream_settings['max_phrase_inactivity_time_in_seconds'],
                score=8),
        }
        self.vector = Vector({0: 1, 1: 1, 2: 1, 3: 1})
        self.initial_max_dimensions = stream_settings['dimensions']
        stream_settings['dimensions'] = 2

    def tearDown(self):
        """Restore the dimension limit saved by setUp."""
        stream_settings['dimensions'] = self.initial_max_dimensions

    def test_updatedPhraseObject_PhraseObjectScoresAreUpdatedCorrectly(self):
        UtilityMethods.updatePhraseTextToPhraseObject(
            self.phraseVector, test_time + timedelta(seconds=60),
            self.phraseTextToPhraseObjectMap, **stream_settings)
        self.assertEqual(5, len(self.phraseTextToPhraseObjectMap))
        self.assertEqual(5, self.phraseTextToPhraseObjectMap['project'].score)
        self.assertEqual(1, self.phraseTextToPhraseObjectMap['streams'].score)

    def test_updatedPhraseObject_phrase_does_not_exist_in_phraseToIdMap_but_exists_in_phraseTextToPhraseObjectMap_with_dimensions_full(self):
        stream_settings['dimensions'] = 1
        self.phraseTextAndDimensionMap.remove(TwoWayMap.MAP_FORWARD, 'cluster')
        UtilityMethods.updatePhraseTextToPhraseObject(
            self.phraseVector, test_time + timedelta(seconds=60),
            self.phraseTextToPhraseObjectMap, **stream_settings)
        self.assertEqual(
            {'project': 0},
            self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))
        self.assertEqual(5, len(self.phraseTextToPhraseObjectMap))
        self.assertEqual(5, self.phraseTextToPhraseObjectMap['project'].score)
        self.assertEqual(5, self.phraseTextToPhraseObjectMap['cluster'].score)
        self.assertEqual(1, self.phraseTextToPhraseObjectMap['streams'].score)

    def test_createOrAddNewPhraseObject(self):
        UtilityMethods.createOrAddNewPhraseObject(
            'new_phrase', self.phraseTextToPhraseObjectMap, test_time,
            **stream_settings)
        UtilityMethods.createOrAddNewPhraseObject(
            'project', self.phraseTextToPhraseObjectMap, test_time,
            **stream_settings)
        self.assertEqual(4, len(self.phraseTextToPhraseObjectMap))
        self.assertEqual(1, self.phraseTextToPhraseObjectMap['new_phrase'].score)
        self.assertEqual(9, self.phraseTextToPhraseObjectMap['project'].score)

    def test_updateDimensions_when_phraseTextToIdMap_is_filled_to_max_dimensions(self):
        for phrase, score in zip(['added'], range(10, 11)):
            self.phraseTextToPhraseObjectMap[phrase] = Phrase(phrase, test_time, score=score)
        UtilityMethods.updateDimensions(
            self.phraseTextAndDimensionMap, self.phraseTextToPhraseObjectMap,
            test_time, **stream_settings)
        self.assertEqual(
            {'project': 0, 'added': 1},
            self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))

    def test_updateDimensions_when_phraseTextToIdMap_is_filled_to_max_dimensions_and_entire_map_is_changed(self):
        for phrase, score in zip(['added', 'are'], range(10, 12)):
            self.phraseTextToPhraseObjectMap[phrase] = Phrase(phrase, test_time, score=score)
        UtilityMethods.updateDimensions(
            self.phraseTextAndDimensionMap, self.phraseTextToPhraseObjectMap,
            test_time, **stream_settings)
        self.assertEqual(
            {'added': 1, 'are': 0},
            self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))

    def test_updateDimensions_when_phraseTextToIdMap_has_lesser_than_max_dimensions(self):
        stream_settings['dimensions'] = 4
        for phrase, score in zip(['new', 'phrases', 'are', 'added'], range(7, 11)):
            self.phraseTextToPhraseObjectMap[phrase] = Phrase(phrase, test_time, score=score)
        UtilityMethods.updateDimensions(
            self.phraseTextAndDimensionMap, self.phraseTextToPhraseObjectMap,
            test_time, **stream_settings)
        # Only the phrase texts are compared here; the assigned ids are not.
        self.assertEqual(
            set(['project', 'phrases', 'are', 'added']),
            set(self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD)))
        self.assertEqual(4, len(self.phraseTextAndDimensionMap))

    def test_updateDimensions_when_phrases_with_lower_id_are_removed_from_phraseTextToIdMap(self):
        stream_settings['dimensions'] = 3
        for phrase, score in zip(['new', 'phrases', 'are'], range(100, 103)):
            self.phraseTextToPhraseObjectMap[phrase] = Phrase(phrase, test_time, score=score)
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'cluster', 2)
        self.phraseTextToPhraseObjectMap['cluster'].score = 100
        UtilityMethods.updateDimensions(
            self.phraseTextAndDimensionMap, self.phraseTextToPhraseObjectMap,
            test_time, **stream_settings)
        # list(...) keeps the comparison valid where range() is not a list.
        self.assertEqual(
            list(range(3)),
            sorted(self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD).values()))

    def test_updateDimensions_remove_old_phrases(self):
        originalTime = self.phraseTextToPhraseObjectMap['abcd'].latestOccuranceTime
        # While 'abcd' looks recent it must survive the update...
        self.phraseTextToPhraseObjectMap['abcd'].latestOccuranceTime = test_time
        UtilityMethods.updateDimensions(
            self.phraseTextAndDimensionMap, self.phraseTextToPhraseObjectMap,
            test_time, **stream_settings)
        self.assertTrue('abcd' in self.phraseTextToPhraseObjectMap)
        # ...and with its old timestamp back it must be dropped.
        self.phraseTextToPhraseObjectMap['abcd'].latestOccuranceTime = originalTime
        UtilityMethods.updateDimensions(
            self.phraseTextAndDimensionMap, self.phraseTextToPhraseObjectMap,
            test_time, **stream_settings)
        self.assertTrue('abcd' not in self.phraseTextToPhraseObjectMap)

    def test_updateDimensions_when_dimensions_have_to_be_removed(self):
        stream_settings['dimensions'] = 4
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'abcdx', 2)
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'abcdxy', 3)
        for phrase, score in zip(['new_text'], range(7, 8)):
            self.phraseTextToPhraseObjectMap[phrase] = Phrase(phrase, test_time, score=score)
        self.phraseTextToPhraseObjectMap['cluster'].latestOccuranceTime = (
            test_time - 3 * stream_settings['max_phrase_inactivity_time_in_seconds'])
        UtilityMethods.updateDimensions(
            self.phraseTextAndDimensionMap, self.phraseTextToPhraseObjectMap,
            test_time, **stream_settings)
        # Only the phrase texts are compared here; the assigned ids are not.
        self.assertEqual(
            set(['project', 'new_text']),
            set(self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD)))

    def test_checkCriticalErrorsInPhraseTextToIdMap_larger_than_expected_dimensions(self):
        # Local import: this block cannot see whether the module imports sys.
        import sys
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'sdfsd', 3)
        # Written without a newline so it prefixes whatever the check prints.
        # sys.stdout.write replaces the Python 2 print statement, which is a
        # syntax error on Python 3.
        sys.stdout.write('Ignore this message: ')
        self.assertRaises(
            SystemExit,
            UtilityMethods.checkCriticalErrorsInPhraseTextToIdMap,
            self.phraseTextAndDimensionMap, **stream_settings)

    def test_pruningConditionDeterministic(self):
        phrase1 = Phrase('dsf', test_time - 3 * stream_settings['max_phrase_inactivity_time_in_seconds'], 1)
        phrase2 = Phrase('dsf', test_time, 1)
        self.assertTrue(UtilityMethods.pruningConditionDeterministic(phrase1, test_time, **stream_settings))
        self.assertFalse(UtilityMethods.pruningConditionDeterministic(phrase2, test_time, **stream_settings))

    def test_pruningConditionRandom(self):
        phrase1 = Phrase('dsf', test_time - 3 * stream_settings['max_phrase_inactivity_time_in_seconds'], 1)
        phrase2 = Phrase('dsf', test_time, 1)
        self.assertTrue(UtilityMethods.pruningConditionRandom(phrase1, test_time, **stream_settings))
        self.assertFalse(UtilityMethods.pruningConditionRandom(phrase2, test_time, **stream_settings))

    def test_pruneUnnecessaryPhrases(self):
        phraseTextToPhraseObjectMap = {
            'dsf': Phrase('dsf', test_time - 3 * stream_settings['max_phrase_inactivity_time_in_seconds'], 1),
            'abc': Phrase('abc', test_time, 1),
        }
        UtilityMethods.pruneUnnecessaryPhrases(
            phraseTextToPhraseObjectMap, test_time,
            UtilityMethods.pruningConditionRandom, **stream_settings)
        self.assertTrue('dsf' not in phraseTextToPhraseObjectMap)
        self.assertTrue('abc' in phraseTextToPhraseObjectMap)
Пример #17
0
class UtilityMethodsTests(unittest.TestCase):
    """Tests for the phrase and dimension bookkeeping in UtilityMethods.

    setUp lowers stream_settings['dimensions'] to 2 (individual tests may
    override it again) and tearDown puts the original value back, because
    stream_settings outlives a single test.
    """

    def setUp(self):
        """Create the phrase fixtures and cap the dimension count at 2."""
        self.phraseVector = {
            'project': 1,
            'cluster': 1,
            'highdimensional': 1,
            'streams': 1
        }
        self.phraseTextAndDimensionMap = TwoWayMap()
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'project', 0)
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'cluster', 1)
        self.phraseTextToPhraseObjectMap = {
            'project': Phrase('project', test_time, score=8),
            'cluster': Phrase('cluster', test_time, score=8),
            # Last seen three inactivity periods before test_time.
            'abcd': Phrase('abcd', self._staleTime(), score=8)
        }
        self.vector = Vector({0: 1, 1: 1, 2: 1, 3: 1})
        self.initial_max_dimensions = stream_settings['dimensions']
        stream_settings['dimensions'] = 2

    def tearDown(self):
        """Restore the dimension count that setUp (or a test) overrode."""
        stream_settings['dimensions'] = self.initial_max_dimensions

    def _staleTime(self):
        """Return a timestamp three inactivity periods before test_time."""
        inactivity = stream_settings['max_phrase_inactivity_time_in_seconds']
        return test_time - 3 * inactivity

    def _addPhrases(self, phraseTexts, firstScore):
        """Register phrases seen at test_time with consecutive scores.

        The first phrase gets firstScore, the next firstScore + 1, and so on.
        """
        for offset, text in enumerate(phraseTexts):
            self.phraseTextToPhraseObjectMap[text] = Phrase(
                text, test_time, score=firstScore + offset)

    def _updateDimensions(self):
        """Run updateDimensions on the fixture maps at test_time."""
        UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap,
                                        self.phraseTextToPhraseObjectMap,
                                        test_time, **stream_settings)

    def _updatePhraseObjects(self):
        """Feed phraseVector to updatePhraseTextToPhraseObject 60 s later."""
        UtilityMethods.updatePhraseTextToPhraseObject(
            self.phraseVector, test_time + timedelta(seconds=60),
            self.phraseTextToPhraseObjectMap, **stream_settings)

    def _dimensionPhrases(self):
        """Return the set of phrase texts that currently own a dimension."""
        return set(
            self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))

    def test_updatedPhraseObject_PhraseObjectScoresAreUpdatedCorrectly(self):
        """Two unseen phrases are added and the asserted scores are reached.

        The fixture holds 3 phrases; phraseVector adds 'highdimensional' and
        'streams', giving 5. 'project' is expected at score 5 and the new
        'streams' at score 1.
        """
        self._updatePhraseObjects()
        self.assertEqual(5, len(self.phraseTextToPhraseObjectMap))
        self.assertEqual(5, self.phraseTextToPhraseObjectMap['project'].score)
        self.assertEqual(1, self.phraseTextToPhraseObjectMap['streams'].score)

    def test_updatedPhraseObject_phrase_does_not_exist_in_phraseToIdMap_but_exists_in_phraseTextToPhraseObjectMap_with_dimensions_full(
            self):
        """Phrase objects are updated while the dimension map stays untouched.

        'cluster' is removed from the dimension map but kept as a phrase
        object; with only one dimension allowed the map must still hold just
        'project'.
        """
        stream_settings['dimensions'] = 1
        self.phraseTextAndDimensionMap.remove(TwoWayMap.MAP_FORWARD, 'cluster')
        self._updatePhraseObjects()
        self.assertEqual({'project': 0},
                         self.phraseTextAndDimensionMap.getMap(
                             TwoWayMap.MAP_FORWARD))
        self.assertEqual(5, len(self.phraseTextToPhraseObjectMap))
        self.assertEqual(5, self.phraseTextToPhraseObjectMap['project'].score)
        self.assertEqual(5, self.phraseTextToPhraseObjectMap['cluster'].score)
        self.assertEqual(1, self.phraseTextToPhraseObjectMap['streams'].score)

    def test_createOrAddNewPhraseObject(self):
        """A new phrase starts at score 1; an existing one goes from 8 to 9."""
        for text in ('new_phrase', 'project'):
            UtilityMethods.createOrAddNewPhraseObject(
                text, self.phraseTextToPhraseObjectMap, test_time,
                **stream_settings)
        self.assertEqual(4, len(self.phraseTextToPhraseObjectMap))
        self.assertEqual(1,
                         self.phraseTextToPhraseObjectMap['new_phrase'].score)
        self.assertEqual(9, self.phraseTextToPhraseObjectMap['project'].score)

    def test_updateDimensions_when_phraseTextToIdMap_is_filled_to_max_dimensions(
            self):
        """With both dimensions taken, 'added' (score 10) takes dimension 1."""
        self._addPhrases(['added'], 10)
        self._updateDimensions()
        self.assertEqual({
            'project': 0,
            'added': 1
        }, self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))

    def test_updateDimensions_when_phraseTextToIdMap_is_filled_to_max_dimensions_and_entire_map_is_changed(
            self):
        """Two higher-scored phrases replace both existing dimensions."""
        self._addPhrases(['added', 'are'], 10)
        self._updateDimensions()
        self.assertEqual({
            'added': 1,
            'are': 0
        }, self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))

    def test_updateDimensions_when_phraseTextToIdMap_has_lesser_than_max_dimensions(
            self):
        """With 4 dimensions allowed, the free slots are filled.

        Only the phrase texts are compared; which id each phrase receives is
        deliberately not asserted.
        """
        stream_settings['dimensions'] = 4
        self._addPhrases(['new', 'phrases', 'are', 'added'], 7)
        self._updateDimensions()
        self.assertEqual(set(['project', 'phrases', 'are', 'added']),
                         self._dimensionPhrases())
        self.assertEqual(4, len(self.phraseTextAndDimensionMap))

    def test_updateDimensions_when_phrases_with_lower_id_are_removed_from_phraseTextToIdMap(
            self):
        """After the update the assigned ids are exactly 0, 1 and 2."""
        stream_settings['dimensions'] = 3
        self._addPhrases(['new', 'phrases', 'are'], 100)
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'cluster', 2)
        self.phraseTextToPhraseObjectMap['cluster'].score = 100
        self._updateDimensions()
        assignedIds = self.phraseTextAndDimensionMap.getMap(
            TwoWayMap.MAP_FORWARD).values()
        # list(...) keeps the comparison meaningful on Python 3, where a bare
        # range object never equals a list.
        self.assertEqual(list(range(3)), sorted(assignedIds))

    def test_updateDimensions_remove_old_phrases(self):
        """A phrase survives while recent and is dropped once it is stale."""
        stalePhrase = self.phraseTextToPhraseObjectMap['abcd']
        originalTime = stalePhrase.latestOccuranceTime
        # First pass: pretend 'abcd' was just seen, so it must be kept.
        stalePhrase.latestOccuranceTime = test_time
        self._updateDimensions()
        self.assertTrue('abcd' in self.phraseTextToPhraseObjectMap)
        # Second pass: restore the stale timestamp, so it must be pruned.
        self.phraseTextToPhraseObjectMap[
            'abcd'].latestOccuranceTime = originalTime
        self._updateDimensions()
        self.assertTrue('abcd' not in self.phraseTextToPhraseObjectMap)

    def test_updateDimensions_when_dimensions_have_to_be_removed(self):
        """Dimensions without a live phrase object are released.

        'abcdx' and 'abcdxy' have no phrase object and 'cluster' is made
        stale; only 'project' and the new 'new_text' are expected to remain.
        Only the phrase texts are compared, not their ids.
        """
        stream_settings['dimensions'] = 4
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'abcdx', 2)
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'abcdxy', 3)
        self._addPhrases(['new_text'], 7)
        self.phraseTextToPhraseObjectMap[
            'cluster'].latestOccuranceTime = self._staleTime()
        self._updateDimensions()
        self.assertEqual(set(['project', 'new_text']),
                         self._dimensionPhrases())

    def test_checkCriticalErrorsInPhraseTextToIdMap_larger_than_expected_dimensions(
            self):
        """A dimension id of 3 with only 2 dimensions allowed must exit."""
        # Imported here so the test does not depend on a file-level import.
        import sys
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'sdfsd', 3)
        # Presumably the check prints an error before exiting; this prefix
        # marks that output as expected. sys.stdout.write keeps it on the
        # same line and, unlike the print statement, parses on Python 2 and 3.
        sys.stdout.write('Ignore this message: ')
        self.assertRaises(
            SystemExit, UtilityMethods.checkCriticalErrorsInPhraseTextToIdMap,
            self.phraseTextAndDimensionMap, **stream_settings)

    def test_pruningConditionDeterministic(self):
        """The deterministic condition flags the stale phrase only."""
        stalePhrase = Phrase('dsf', self._staleTime(), 1)
        freshPhrase = Phrase('dsf', test_time, 1)
        self.assertTrue(
            UtilityMethods.pruningConditionDeterministic(
                stalePhrase, test_time, **stream_settings))
        self.assertFalse(
            UtilityMethods.pruningConditionDeterministic(
                freshPhrase, test_time, **stream_settings))

    def test_pruningConditionRandom(self):
        """The random condition flags the stale phrase only."""
        stalePhrase = Phrase('dsf', self._staleTime(), 1)
        freshPhrase = Phrase('dsf', test_time, 1)
        self.assertTrue(
            UtilityMethods.pruningConditionRandom(stalePhrase, test_time,
                                                  **stream_settings))
        self.assertFalse(
            UtilityMethods.pruningConditionRandom(freshPhrase, test_time,
                                                  **stream_settings))

    def test_pruneUnnecessaryPhrases(self):
        """Pruning removes the stale phrase and keeps the fresh one."""
        phraseTextToPhraseObjectMap = {
            'dsf': Phrase('dsf', self._staleTime(), 1),
            'abc': Phrase('abc', test_time, 1)
        }
        UtilityMethods.pruneUnnecessaryPhrases(
            phraseTextToPhraseObjectMap, test_time,
            UtilityMethods.pruningConditionRandom, **stream_settings)
        self.assertTrue('dsf' not in phraseTextToPhraseObjectMap)
        self.assertTrue('abc' in phraseTextToPhraseObjectMap)
Пример #18
0
class SignaturePermutationTests(unittest.TestCase):
    """Tests for SignaturePermutationWithTrie built over random documents."""

    def setUp(self):
        """Index two random, signed documents (ids 1 and 2) in self.pm."""
        self.dimension, self.signatureLength = 50, 23
        self.phraseTextAndDimensionMap = TwoWayMap()
        # Identity mapping: dimension i is keyed by i itself.
        for i in range(self.dimension):
            self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, i, i)
        self.unitRandomVectors = [
            self._randomUnitVector() for _ in range(self.signatureLength)
        ]
        self.doc1 = self._randomDocument(1)
        self.doc2 = self._randomDocument(2)
        self._sign(self.doc1)
        self._sign(self.doc2)
        self.pm = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        self.pm.addDocument(self.doc1)
        self.pm.addDocument(self.doc2)

    def _randomUnitVector(self):
        """Return a random Gaussian unit vector with self.dimension entries."""
        return VectorGenerator.getRandomGaussianUnitVector(
            dimension=self.dimension, mu=0, sigma=1)

    def _randomDocument(self, docId):
        """Return an unsigned Document with the given id and a random vector."""
        return Document(docId, self._randomUnitVector())

    def _sign(self, document):
        """Set the document's signature from the shared random vectors."""
        document.setSignatureUsingVectors(self.unitRandomVectors,
                                          self.phraseTextAndDimensionMap)

    def _documentWithDoc1Signature(self, docId):
        """Return a random document whose signature is a copy of doc1's."""
        document = self._randomDocument(docId)
        document.signature = Signature(self.doc1.signature.to01())
        return document

    def _trieKey(self, document, permutation):
        """Return the trie key of the document under the given permutation."""
        return document.signature.permutate(permutation).to01()

    def test_addDocument_newKey(self):
        """Adding to an empty permutation stores the id under a new key."""
        doc1 = self._randomDocument(1)
        self._sign(doc1)
        pm = SignaturePermutationWithTrie(signatureLength=self.signatureLength)
        pm.addDocument(doc1)
        self.assertEqual(pm.signatureTrie[self._trieKey(doc1, pm)], set([1]))

    def test_addDocument_existingKey(self):
        """A document with doc1's signature joins doc1's entry in the trie."""
        newDocModifiedWithExistingSignature = (
            self._documentWithDoc1Signature(3))
        self.pm.addDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(
            self.pm.signatureTrie[self._trieKey(self.doc1, self.pm)],
            set([1, 3]))

    def test_getNearestDocument_usingAKeyAlreadyInTrie(self):
        """Looking up an indexed document returns its own id."""
        self.assertEqual(self.pm.getNearestDocuments(self.doc1), set([1]))

    def test_getNearestDocument_usingANearbyKeyInTrie(self):
        """A signature differing from doc1's in the last bit still finds doc1."""
        digitReplacement = {'0': '1', '1': '0'}
        newDocWithANearbySignature = self._randomDocument(3)
        exactSignature = self.doc1.signature.to01()
        # Flip only the final bit of doc1's signature.
        newDocWithANearbySignature.signature = Signature(
            exactSignature[:-1] + digitReplacement[exactSignature[-1]])
        # assertNotEqual: the assertNotEquals alias is deprecated and was
        # removed in Python 3.12.
        self.assertNotEqual(self.doc1.signature.to01(),
                            newDocWithANearbySignature.signature.to01())
        self.assertEqual(
            self.pm.getNearestDocuments(newDocWithANearbySignature), set([1])
        )  # This assertion can sometimes fail because of randomization. Run the tests again. It's OK!

    def test_getNearestDocument_emptyTrie(self):
        """An empty permutation returns an empty set of neighbours."""
        permutationWithEmptyTrie = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        self.assertEqual(
            permutationWithEmptyTrie.getNearestDocuments(self.doc1), set())

    def test_removeDocument_documents(self):
        """Removing documents shrinks the entry, then drops the key."""
        newDocModifiedWithExistingSignature = (
            self._documentWithDoc1Signature(3))
        self.pm.addDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(
            self.pm.signatureTrie[self._trieKey(self.doc1, self.pm)],
            set([1, 3]))
        self.pm.removeDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(
            self.pm.signatureTrie[self._trieKey(self.doc1, self.pm)],
            set([1]))
        self.pm.removeDocument(self.doc1)
        # With its last document gone the key must no longer be present.
        self.assertEqual(
            None,
            self.pm.signatureTrie.get(self._trieKey(self.doc1, self.pm)))

    def test_resetSignatureTrie(self):
        """resetSignatureDataStructure empties a populated trie."""
        self.assertTrue(len(self.pm.signatureTrie) > 0)
        self.pm.resetSignatureDataStructure()
        self.assertEqual(0, len(self.pm.signatureTrie))