def setUp(self):
    """Prepare phrase fixtures and cap ``stream_settings['dimensions']`` at 2.

    The previous ``dimensions`` value is stored on
    ``self.initial_max_dimensions`` before it is overwritten.
    """
    self.phraseVector = {
        'project': 1, 'cluster': 1, 'highdimensional': 1, 'streams': 1
    }

    # Only 'project' and 'cluster' start out with an assigned dimension.
    dimensionMap = TwoWayMap()
    for dimension, phraseText in enumerate(['project', 'cluster']):
        dimensionMap.set(TwoWayMap.MAP_FORWARD, phraseText, dimension)
    self.phraseTextAndDimensionMap = dimensionMap

    phraseObjects = {}
    for phraseText in ['project', 'cluster']:
        phraseObjects[phraseText] = Phrase(phraseText, test_time, score=8)
    # 'abcd' is stamped three inactivity periods before test_time.
    inactivity = stream_settings['max_phrase_inactivity_time_in_seconds']
    phraseObjects['abcd'] = Phrase('abcd', test_time - 3 * inactivity,
                                   score=8)
    self.phraseTextToPhraseObjectMap = phraseObjects

    self.vector = Vector(dict((dimension, 1) for dimension in range(4)))

    self.initial_max_dimensions = stream_settings['dimensions']
    stream_settings['dimensions'] = 2
def setUp(self):
    """Create two documents with random signatures and index both in a trie.

    All vectors come from ``VectorGenerator.getRandomGaussianUnitVector``
    with 50 dimensions; 23 of them are used to derive the signatures.
    """
    self.dimension, self.signatureLength = 50, 23

    def randomUnitVector():
        return VectorGenerator.getRandomGaussianUnitVector(
            dimension=self.dimension, mu=0, sigma=1)

    # Each integer i in [0, dimension) is mapped to itself.
    self.phraseTextAndDimensionMap = TwoWayMap()
    for dimension in range(self.dimension):
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD,
                                           dimension, dimension)

    self.unitRandomVectors = [
        randomUnitVector() for _ in range(self.signatureLength)
    ]
    self.doc1 = Document(1, randomUnitVector())
    self.doc2 = Document(2, randomUnitVector())
    for document in (self.doc1, self.doc2):
        document.setSignatureUsingVectors(self.unitRandomVectors,
                                          self.phraseTextAndDimensionMap)

    self.pm = SignaturePermutationWithTrie(
        signatureLength=self.signatureLength)
    for document in (self.doc1, self.doc2):
        self.pm.addDocument(document)
def test_setSignatureUsingVectors(self):
    """Signatures are computed from the mapped dimensions of each document.

    One document only uses phrases present in the map ('a', 'b'); the other
    contains 'c', which has no dimension assigned.
    """
    forward = TwoWayMap.MAP_FORWARD
    dimensionMap = TwoWayMap()
    for phraseText, dimension in (('a', 1), ('b', 2)):
        dimensionMap.set(forward, phraseText, dimension)

    fullyMapped = Document(1, {'a': 1, 'b': 4})
    partlyMapped = Document(1, {'a': 1, 'c': 4})

    # NOTE(review): presumably each signature bit reflects the sign of the
    # document's dot product with the corresponding vector - confirm.
    firstVector = Vector({1: 3 / 5., 2: -4 / 5.})
    secondVector = Vector({1: -5 / 13., 2: 12 / 13.})
    vectors = [firstVector, secondVector]

    for document in (fullyMapped, partlyMapped):
        document.setSignatureUsingVectors(vectors, dimensionMap)

    self.assertEqual(Signature('01'), fullyMapped.signature)
    self.assertEqual(Signature('10'), partlyMapped.signature)
def setUp(self):
    """Build the phrase vector, a two-entry dimension map and the expected map.

    Also sets the shared ``settings['dimensions']`` to 2.
    """
    forward = TwoWayMap.MAP_FORWARD

    self.phraseVector = {
        'project': 1,
        'cluster': 1,
        'highdimensional': 1,
        'streams': 1,
    }

    # Only 'project' and 'cluster' have a dimension before the update runs.
    self.phraseTextAndDimensionMap = TwoWayMap()
    for phraseText, dimension in (('project', 0), ('cluster', 1)):
        self.phraseTextAndDimensionMap.set(forward, phraseText, dimension)

    # Forward map expected when all four phrases receive a dimension.
    self.finalPhraseToIdMap = dict(project=0, cluster=1, streams=2,
                                   highdimensional=3)

    settings['dimensions'] = 2
class UtilityMethodsTests(unittest.TestCase):
    """Tests for ``UtilityMethods.updatePhraseTextAndDimensionsMap``."""

    def setUp(self):
        """Build the fixtures and set ``settings['dimensions']`` to 2.

        The previous state of ``settings['dimensions']`` is recorded so that
        ``tearDown`` can undo the changes made here and inside the tests.
        """
        self.phraseVector = {
            'project': 1, 'cluster': 1, 'highdimensional': 1, 'streams': 1
        }
        self.phraseTextAndDimensionMap = TwoWayMap()
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD,
                                           'project', 0)
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD,
                                           'cluster', 1)
        # NOTE(review): the ids expected for 'streams' and 'highdimensional'
        # presumably depend on the order in which the update visits
        # self.phraseVector - confirm against the implementation.
        self.finalPhraseToIdMap = {
            'project': 0, 'cluster': 1, 'streams': 2, 'highdimensional': 3
        }
        # ``settings`` is shared module state; remember what was there.
        self._hadDimensions = 'dimensions' in settings
        self._originalDimensions = settings.get('dimensions')
        settings['dimensions'] = 2

    def tearDown(self):
        """Restore ``settings['dimensions']`` to its state before ``setUp``."""
        if self._hadDimensions:
            settings['dimensions'] = self._originalDimensions
        else:
            settings.pop('dimensions', None)

    def test_updatePhraseTextAndDimensionsMap_PhraseMapHasLesserDimensions(
            self):
        """With four dimensions allowed, all four phrases end up in the map."""
        settings['dimensions'] = 4
        UtilityMethods.updatePhraseTextAndDimensionsMap(
            self.phraseVector, self.phraseTextAndDimensionMap, **settings)
        self.assertEqual(
            self.finalPhraseToIdMap,
            self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))

    def test_updatePhraseTextAndDimensionsMap_PhraseMapHasMaximumDimensions(
            self):
        """With the map already at two dimensions, no phrase is added."""
        UtilityMethods.updatePhraseTextAndDimensionsMap(
            self.phraseVector, self.phraseTextAndDimensionMap, **settings)
        for phraseText in ['streams', 'highdimensional']:
            del self.finalPhraseToIdMap[phraseText]
        self.assertEqual(
            self.finalPhraseToIdMap,
            self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))
def __init__(self, **settings):
    """Set up the random unit vector, its permutations and signature indexes.

    Keys read from ``settings``: ``nearest_neighbor_threshold``,
    ``dimensions``, ``signature_length``, ``number_of_permutations`` and the
    optional ``signature_type`` (defaults to ``'signature_type_trie'``; any
    other value selects the sorted-list implementation).
    """
    self.settings = settings
    self.nearestNeighborThreshold = settings['nearest_neighbor_threshold']

    dimensions = settings['dimensions']
    self.unitVector = RandomGaussianUnitVector(dimensions=dimensions,
                                               mu=0, sigma=1)

    signatureLength = settings['signature_length']
    self.vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, self.unitVector)

    if settings.get('signature_type',
                    'signature_type_trie') == 'signature_type_trie':
        permutationClass = SignaturePermutationWithTrie
    else:
        permutationClass = SignaturePermutationWithSortedList

    # One independent signature index per requested permutation.
    permutations = []
    for _ in range(settings['number_of_permutations']):
        permutations.append(permutationClass(signatureLength))
    self.signaturePermutations = permutations

    self.phraseTextAndDimensionMap = TwoWayMap()
    self.documentIdToDocumentMap = {}
class SignaturePermutationTests(unittest.TestCase):
    """Tests for adding, finding and removing documents in
    ``SignaturePermutationWithTrie``."""

    def setUp(self):
        """Index two documents with random signatures in ``self.pm``."""
        self.dimension, self.signatureLength = 50, 23
        self.phraseTextAndDimensionMap = TwoWayMap()
        for i in range(self.dimension):
            self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, i, i)
        self.unitRandomVectors = [
            self._randomUnitVector() for _ in range(self.signatureLength)
        ]
        self.doc1 = Document(1, self._randomUnitVector())
        self.doc2 = Document(2, self._randomUnitVector())
        self.doc1.setSignatureUsingVectors(self.unitRandomVectors,
                                           self.phraseTextAndDimensionMap)
        self.doc2.setSignatureUsingVectors(self.unitRandomVectors,
                                           self.phraseTextAndDimensionMap)
        self.pm = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        self.pm.addDocument(self.doc1)
        self.pm.addDocument(self.doc2)

    def _randomUnitVector(self):
        """Return a fresh random Gaussian unit vector of ``self.dimension``."""
        return VectorGenerator.getRandomGaussianUnitVector(
            dimension=self.dimension, mu=0, sigma=1)

    def _trieKey(self, document, permutation):
        """Return the bit string under which ``permutation`` keys ``document``."""
        return document.signature.permutate(permutation).to01()

    def test_addDocument_newKey(self):
        """A document added to an empty index is stored under its own key."""
        doc1 = Document(1, self._randomUnitVector())
        doc1.setSignatureUsingVectors(self.unitRandomVectors,
                                      self.phraseTextAndDimensionMap)
        pm = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        pm.addDocument(doc1)
        self.assertEqual(pm.signatureTrie[self._trieKey(doc1, pm)], set([1]))

    def test_addDocument_existingKey(self):
        """A second document with the same signature shares the trie entry."""
        newDocModifiedWithExistingSignature = Document(
            3, self._randomUnitVector())
        newDocModifiedWithExistingSignature.signature = Signature(
            self.doc1.signature.to01())
        self.pm.addDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(
            self.pm.signatureTrie[self._trieKey(self.doc1, self.pm)],
            set([1, 3]))

    def test_getNearestDocument_usingAKeyAlreadyInTrie(self):
        """Looking up an indexed document returns that document's id."""
        self.assertEqual(self.pm.getNearestDocuments(self.doc1), set([1]))

    def test_getNearestDocument_usingANearbyKeyInTrie(self):
        """A signature differing from doc1's in its last bit still finds doc1."""
        digitReplacement = {'0': '1', '1': '0'}
        newDocWithANearbySignature = Document(3, self._randomUnitVector())
        exactSignature = self.doc1.signature.to01()
        newDocWithANearbySignature.signature = Signature(
            exactSignature[:-1] + digitReplacement[exactSignature[-1]])
        self.assertNotEqual(self.doc1.signature.to01(),
                            newDocWithANearbySignature.signature.to01())
        # This assertion can sometimes fail because of randomization.
        # Run the tests again. It's OK!
        # NOTE(review): presumably the random permutation can move the
        # flipped bit toward the front of the key - confirm.
        self.assertEqual(
            self.pm.getNearestDocuments(newDocWithANearbySignature),
            set([1]))

    def test_getNearestDocument_emptyTrie(self):
        """An index without documents returns an empty set."""
        permutationWithEmptyTrie = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        self.assertEqual(
            permutationWithEmptyTrie.getNearestDocuments(self.doc1), set())

    def test_removeDocument_documents(self):
        """Removing documents shrinks the entry and finally drops the key."""
        newDocModifiedWithExistingSignature = Document(
            3, self._randomUnitVector())
        newDocModifiedWithExistingSignature.signature = Signature(
            self.doc1.signature.to01())
        self.pm.addDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(
            self.pm.signatureTrie[self._trieKey(self.doc1, self.pm)],
            set([1, 3]))
        self.pm.removeDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(
            self.pm.signatureTrie[self._trieKey(self.doc1, self.pm)],
            set([1]))
        self.pm.removeDocument(self.doc1)
        self.assertIsNone(
            self.pm.signatureTrie.get(self._trieKey(self.doc1, self.pm)))

    def test_resetSignatureTrie(self):
        """Resetting the data structure leaves an empty trie."""
        self.assertGreater(len(self.pm.signatureTrie), 0)
        self.pm.resetSignatureDataStructure()
        self.assertEqual(0, len(self.pm.signatureTrie))
def setUp(self):
    """Create the phrase fixtures and set ``settings['dimensions']`` to 2."""
    self.phraseVector = {
        'project': 1, 'cluster': 1, 'highdimensional': 1, 'streams': 1
    }

    # Dimensions that exist before any update is applied.
    initialDimensions = [('project', 0), ('cluster', 1)]
    dimensionMap = TwoWayMap()
    for phraseText, dimension in initialDimensions:
        dimensionMap.set(TwoWayMap.MAP_FORWARD, phraseText, dimension)
    self.phraseTextAndDimensionMap = dimensionMap

    # Expected forward map: the initial entries plus the two new phrases.
    expected = dict(initialDimensions)
    expected['streams'] = 2
    expected['highdimensional'] = 3
    self.finalPhraseToIdMap = expected

    settings['dimensions'] = 2
def setUp(self):
    """Create phrase fixtures and limit ``stream_settings['dimensions']`` to 2.

    The value that was in ``stream_settings['dimensions']`` is saved on
    ``self.initial_max_dimensions`` first.
    """
    forward = TwoWayMap.MAP_FORWARD
    self.phraseVector = {
        'project': 1,
        'cluster': 1,
        'highdimensional': 1,
        'streams': 1,
    }

    self.phraseTextAndDimensionMap = TwoWayMap()
    for phraseText, dimension in (('project', 0), ('cluster', 1)):
        self.phraseTextAndDimensionMap.set(forward, phraseText, dimension)

    # 'project' and 'cluster' are stamped with test_time; 'abcd' is stamped
    # three inactivity periods earlier.
    projectPhrase = Phrase('project', test_time, score=8)
    clusterPhrase = Phrase('cluster', test_time, score=8)
    staleTime = test_time - 3 * stream_settings[
        'max_phrase_inactivity_time_in_seconds']
    stalePhrase = Phrase('abcd', staleTime, score=8)
    self.phraseTextToPhraseObjectMap = {
        'project': projectPhrase,
        'cluster': clusterPhrase,
        'abcd': stalePhrase,
    }

    self.vector = Vector({0: 1, 1: 1, 2: 1, 3: 1})

    previousDimensions = stream_settings['dimensions']
    self.initial_max_dimensions = previousDimensions
    stream_settings['dimensions'] = 2
def setUp(self):
    """Build two randomly signed documents and add them to a trie index."""
    self.dimension = 50
    self.signatureLength = 23

    # Arguments shared by every random unit vector generated below.
    gaussianArgs = dict(dimension=self.dimension, mu=0, sigma=1)

    dimensionMap = TwoWayMap()
    for index in range(self.dimension):
        # Each integer is mapped to itself.
        dimensionMap.set(TwoWayMap.MAP_FORWARD, index, index)
    self.phraseTextAndDimensionMap = dimensionMap

    signatureVectors = []
    for _ in range(self.signatureLength):
        signatureVectors.append(
            VectorGenerator.getRandomGaussianUnitVector(**gaussianArgs))
    self.unitRandomVectors = signatureVectors

    self.doc1 = Document(
        1, VectorGenerator.getRandomGaussianUnitVector(**gaussianArgs))
    self.doc2 = Document(
        2, VectorGenerator.getRandomGaussianUnitVector(**gaussianArgs))
    self.doc1.setSignatureUsingVectors(signatureVectors, dimensionMap)
    self.doc2.setSignatureUsingVectors(signatureVectors, dimensionMap)

    index = SignaturePermutationWithTrie(
        signatureLength=self.signatureLength)
    self.pm = index
    index.addDocument(self.doc1)
    index.addDocument(self.doc2)
class UtilityMethodsTests(unittest.TestCase):
    """Tests for ``UtilityMethods.updatePhraseTextAndDimensionsMap``."""

    def setUp(self):
        """Create the fixtures and set ``settings['dimensions']`` to 2.

        ``settings`` is shared state, so its previous ``dimensions`` entry is
        recorded here and put back by ``tearDown``.
        """
        forward = TwoWayMap.MAP_FORWARD
        self.phraseVector = {
            'project': 1,
            'cluster': 1,
            'highdimensional': 1,
            'streams': 1
        }
        self.phraseTextAndDimensionMap = TwoWayMap()
        self.phraseTextAndDimensionMap.set(forward, 'project', 0)
        self.phraseTextAndDimensionMap.set(forward, 'cluster', 1)
        # NOTE(review): which of 'streams'/'highdimensional' receives id 2
        # presumably depends on the order in which the phrase vector is
        # visited by the code under test - confirm.
        self.finalPhraseToIdMap = {
            'project': 0,
            'cluster': 1,
            'streams': 2,
            'highdimensional': 3
        }
        self._dimensionsWasSet = 'dimensions' in settings
        self._savedDimensions = settings.get('dimensions')
        settings['dimensions'] = 2

    def tearDown(self):
        """Undo every change made to ``settings['dimensions']``."""
        if self._dimensionsWasSet:
            settings['dimensions'] = self._savedDimensions
        else:
            settings.pop('dimensions', None)

    def _forwardMap(self):
        """Return the forward view of the dimension map."""
        return self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD)

    def test_updatePhraseTextAndDimensionsMap_PhraseMapHasLesserDimensions(
            self):
        """Raising the limit to 4 lets the two new phrases get dimensions."""
        settings['dimensions'] = 4
        UtilityMethods.updatePhraseTextAndDimensionsMap(
            self.phraseVector, self.phraseTextAndDimensionMap, **settings)
        self.assertEqual(self.finalPhraseToIdMap, self._forwardMap())

    def test_updatePhraseTextAndDimensionsMap_PhraseMapHasMaximumDimensions(
            self):
        """At the limit of 2 the map keeps only its original two phrases."""
        UtilityMethods.updatePhraseTextAndDimensionsMap(
            self.phraseVector, self.phraseTextAndDimensionMap, **settings)
        for phraseText in ['streams', 'highdimensional']:
            del self.finalPhraseToIdMap[phraseText]
        self.assertEqual(self.finalPhraseToIdMap, self._forwardMap())
def test_setSignatureUsingVectors(self):
    """Check the signatures of a fully mapped and a partly mapped document."""
    dimensionMap = TwoWayMap()
    dimensionMap.set(TwoWayMap.MAP_FORWARD, 'a', 1)
    dimensionMap.set(TwoWayMap.MAP_FORWARD, 'b', 2)

    # Second document contains 'c', which has no entry in dimensionMap.
    documents = [Document(1, {'a': 1, 'b': 4}), Document(1, {'a': 1, 'c': 4})]

    vectors = [
        Vector(components)
        for components in ({1: 3 / 5., 2: -4 / 5.},
                           {1: -5 / 13., 2: 12 / 13.})
    ]

    for document in documents:
        document.setSignatureUsingVectors(vectors, dimensionMap)

    withDimensionsInVector, withDimensionsNotInVector = documents
    self.assertEqual(Signature('01'), withDimensionsInVector.signature)
    self.assertEqual(Signature('10'), withDimensionsNotInVector.signature)
def test_setSignatureUsingVectorPermutations(self):
    """Both signature methods agree, with a full and a truncated map.

    The same document vector is signed once from explicitly permuted unit
    vectors and once from the unit vector plus its permutations.
    """
    dimensions = 53
    signatureLength = 13

    def identityMap(size):
        mapping = TwoWayMap()
        for dimension in range(size):
            mapping.set(TwoWayMap.MAP_FORWARD, dimension, dimension)
        return mapping

    completeMap = identityMap(dimensions)
    # Only the first three dimensions are present in this map.
    truncatedMap = identityMap(dimensions - 50)

    unitVector = RandomGaussianUnitVector(dimensions=dimensions, mu=0,
                                          sigma=1)
    vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, unitVector)
    permutedUnitVectors = []
    for permutation in vectorPermutations:
        permutedUnitVectors.append(unitVector.getPermutedVector(permutation))

    documentVector = VectorGenerator.getRandomGaussianUnitVector(
        dimension=dimensions, mu=0, sigma=1)
    signedByVectors = Document(1, documentVector)
    signedByPermutations = Document(2, documentVector)

    for dimensionMap in (completeMap, truncatedMap):
        signedByVectors.setSignatureUsingVectors(permutedUnitVectors,
                                                 dimensionMap)
        signedByPermutations.setSignatureUsingVectorPermutations(
            unitVector, vectorPermutations, dimensionMap)
        self.assertEqual(signedByVectors.signature,
                         signedByPermutations.signature)
def __init__(self, **clustering_settings):
    """Initialise vectors, signature indexes and empty cluster state.

    Keys read from ``clustering_settings``:
    ``threshold_for_document_to_be_in_cluster``, ``dimensions``,
    ``signature_length``, ``number_of_permutations`` and the optional
    ``signature_type`` (defaults to ``'signature_type_trie'``; any other
    value selects the sorted-list implementation).
    """
    self.thresholdForDocumentToBeInACluster = clustering_settings[
        'threshold_for_document_to_be_in_cluster']

    dimensions = clustering_settings['dimensions']
    self.unitVector = RandomGaussianUnitVector(dimensions=dimensions,
                                               mu=0, sigma=1)

    signatureLength = clustering_settings['signature_length']
    self.vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, self.unitVector)

    signatureType = clustering_settings.get('signature_type',
                                            'signature_type_trie')
    useTrie = signatureType == 'signature_type_trie'
    numberOfPermutations = clustering_settings['number_of_permutations']
    if useTrie:
        self.signaturePermutations = [
            SignaturePermutationWithTrie(signatureLength)
            for _ in range(numberOfPermutations)
        ]
    else:
        self.signaturePermutations = [
            SignaturePermutationWithSortedList(signatureLength)
            for _ in range(numberOfPermutations)
        ]

    self.phraseTextAndDimensionMap = TwoWayMap()
    self.clusters = {}
    self.clustering_settings = clustering_settings
def test_setSignatureUsingVectorPermutations(self):
    """Signing via permuted vectors matches signing via vector permutations.

    The comparison is made twice: with a map covering all 53 dimensions and
    with a map that only contains the first three.
    """
    dimensions, signatureLength = 53, 13
    forward = TwoWayMap.MAP_FORWARD

    fullMap = TwoWayMap()
    for index in range(dimensions):
        fullMap.set(forward, index, index)

    mapWithMissingDimensions = TwoWayMap()
    for index in range(dimensions - 50):
        mapWithMissingDimensions.set(forward, index, index)

    unitVector = RandomGaussianUnitVector(dimensions=dimensions, mu=0,
                                          sigma=1)
    vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, unitVector)
    permutedVectors = [
        unitVector.getPermutedVector(permutation)
        for permutation in vectorPermutations
    ]

    # Both documents wrap the very same vector; only their ids differ.
    sharedVector = VectorGenerator.getRandomGaussianUnitVector(
        dimension=dimensions, mu=0, sigma=1)
    viaVectors = Document(1, sharedVector)
    viaPermutations = Document(2, sharedVector)

    def signBoth(dimensionMap):
        viaVectors.setSignatureUsingVectors(permutedVectors, dimensionMap)
        viaPermutations.setSignatureUsingVectorPermutations(
            unitVector, vectorPermutations, dimensionMap)

    signBoth(fullMap)
    self.assertEqual(viaVectors.signature, viaPermutations.signature)

    signBoth(mapWithMissingDimensions)
    self.assertEqual(viaVectors.signature, viaPermutations.signature)
class UtilityMethodsTests(unittest.TestCase):
    """Tests for the phrase-object and dimension helpers in ``UtilityMethods``."""

    def setUp(self):
        """Create phrase fixtures and cap ``stream_settings['dimensions']`` at 2."""
        self.phraseVector = {
            'project': 1, 'cluster': 1, 'highdimensional': 1, 'streams': 1
        }
        self.phraseTextAndDimensionMap = TwoWayMap()
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD,
                                           'project', 0)
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD,
                                           'cluster', 1)
        # 'abcd' is stamped three inactivity periods before test_time.
        self.phraseTextToPhraseObjectMap = {
            'project': Phrase('project', test_time, score=8),
            'cluster': Phrase('cluster', test_time, score=8),
            'abcd': Phrase(
                'abcd',
                test_time - 3 * stream_settings[
                    'max_phrase_inactivity_time_in_seconds'],
                score=8)
        }
        self.vector = Vector({0: 1, 1: 1, 2: 1, 3: 1})
        self.initial_max_dimensions = stream_settings['dimensions']
        stream_settings['dimensions'] = 2

    def tearDown(self):
        """Restore the ``dimensions`` value saved by ``setUp``."""
        stream_settings['dimensions'] = self.initial_max_dimensions

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    def _forwardMap(self):
        """Return the forward view of the dimension map."""
        return self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD)

    def _addPhrases(self, scoredPhrases):
        """Insert a ``Phrase`` stamped with ``test_time`` per (text, score)."""
        for phrase, score in scoredPhrases:
            self.phraseTextToPhraseObjectMap[phrase] = Phrase(
                phrase, test_time, score=score)

    def _updateDimensions(self):
        """Call ``UtilityMethods.updateDimensions`` on the fixtures at ``test_time``."""
        UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap,
                                        self.phraseTextToPhraseObjectMap,
                                        test_time, **stream_settings)

    def _updatePhraseObjects(self):
        """Apply ``self.phraseVector`` sixty seconds after ``test_time``."""
        UtilityMethods.updatePhraseTextToPhraseObject(
            self.phraseVector, test_time + timedelta(seconds=60),
            self.phraseTextToPhraseObjectMap, **stream_settings)

    # ------------------------------------------------------------------
    # updatePhraseTextToPhraseObject / createOrAddNewPhraseObject
    # ------------------------------------------------------------------
    def test_updatedPhraseObject_PhraseObjectScoresAreUpdatedCorrectly(self):
        self._updatePhraseObjects()
        self.assertEqual(5, len(self.phraseTextToPhraseObjectMap))
        self.assertEqual(5, self.phraseTextToPhraseObjectMap['project'].score)
        self.assertEqual(1, self.phraseTextToPhraseObjectMap['streams'].score)

    def test_updatedPhraseObject_phrase_does_not_exist_in_phraseToIdMap_but_exists_in_phraseTextToPhraseObjectMap_with_dimensions_full(self):
        stream_settings['dimensions'] = 1
        self.phraseTextAndDimensionMap.remove(TwoWayMap.MAP_FORWARD, 'cluster')
        self._updatePhraseObjects()
        self.assertEqual({'project': 0}, self._forwardMap())
        self.assertEqual(5, len(self.phraseTextToPhraseObjectMap))
        self.assertEqual(5, self.phraseTextToPhraseObjectMap['project'].score)
        self.assertEqual(5, self.phraseTextToPhraseObjectMap['cluster'].score)
        self.assertEqual(1, self.phraseTextToPhraseObjectMap['streams'].score)

    def test_createOrAddNewPhraseObject(self):
        UtilityMethods.createOrAddNewPhraseObject(
            'new_phrase', self.phraseTextToPhraseObjectMap, test_time,
            **stream_settings)
        UtilityMethods.createOrAddNewPhraseObject(
            'project', self.phraseTextToPhraseObjectMap, test_time,
            **stream_settings)
        self.assertEqual(4, len(self.phraseTextToPhraseObjectMap))
        self.assertEqual(
            1, self.phraseTextToPhraseObjectMap['new_phrase'].score)
        self.assertEqual(9, self.phraseTextToPhraseObjectMap['project'].score)

    # ------------------------------------------------------------------
    # updateDimensions
    # ------------------------------------------------------------------
    def test_updateDimensions_when_phraseTextToIdMap_is_filled_to_max_dimensions(self):
        self._addPhrases([('added', 10)])
        self._updateDimensions()
        self.assertEqual({'project': 0, 'added': 1}, self._forwardMap())

    def test_updateDimensions_when_phraseTextToIdMap_is_filled_to_max_dimensions_and_entire_map_is_changed(self):
        self._addPhrases([('added', 10), ('are', 11)])
        self._updateDimensions()
        self.assertEqual({'added': 1, 'are': 0}, self._forwardMap())

    def test_updateDimensions_when_phraseTextToIdMap_has_lesser_than_max_dimensions(self):
        stream_settings['dimensions'] = 4
        self._addPhrases(
            [('new', 7), ('phrases', 8), ('are', 9), ('added', 10)])
        self._updateDimensions()
        # Only the set of phrases is checked, not the ids assigned to them.
        self.assertEqual(set(['project', 'phrases', 'are', 'added']),
                         set(self._forwardMap()))
        self.assertEqual(4, len(self.phraseTextAndDimensionMap))

    def test_updateDimensions_when_phrases_with_lower_id_are_removed_from_phraseTextToIdMap(self):
        stream_settings['dimensions'] = 3
        self._addPhrases([('new', 100), ('phrases', 101), ('are', 102)])
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'cluster', 2)
        self.phraseTextToPhraseObjectMap['cluster'].score = 100
        self._updateDimensions()
        # list(...) keeps the comparison valid where range() is not a list.
        self.assertEqual(list(range(3)), sorted(self._forwardMap().values()))

    def test_updateDimensions_remove_old_phrases(self):
        stalePhrase = self.phraseTextToPhraseObjectMap['abcd']
        originalTime = stalePhrase.latestOccuranceTime
        # While 'abcd' looks recent it must survive the update ...
        stalePhrase.latestOccuranceTime = test_time
        self._updateDimensions()
        self.assertIn('abcd', self.phraseTextToPhraseObjectMap)
        # ... and with its original, old timestamp it must be dropped.
        stalePhrase.latestOccuranceTime = originalTime
        self._updateDimensions()
        self.assertNotIn('abcd', self.phraseTextToPhraseObjectMap)

    def test_updateDimensions_when_dimensions_have_to_be_removed(self):
        stream_settings['dimensions'] = 4
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'abcdx', 2)
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'abcdxy', 3)
        self._addPhrases([('new_text', 7)])
        self.phraseTextToPhraseObjectMap['cluster'].latestOccuranceTime = (
            test_time - 3 * stream_settings[
                'max_phrase_inactivity_time_in_seconds'])
        self._updateDimensions()
        # Only the set of phrases is checked, not the ids assigned to them.
        self.assertEqual(set(['project', 'new_text']),
                         set(self._forwardMap()))

    def test_checkCriticalErrorsInPhraseTextToIdMap_larger_than_expected_dimensions(self):
        import sys
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'sdfsd', 3)
        # Prefix for whatever the check prints before exiting; written
        # without a newline and without the Python-2-only print statement.
        sys.stdout.write('Ignore this message: ')
        self.assertRaises(
            SystemExit,
            UtilityMethods.checkCriticalErrorsInPhraseTextToIdMap,
            self.phraseTextAndDimensionMap, **stream_settings)

    # ------------------------------------------------------------------
    # pruning
    # ------------------------------------------------------------------
    def test_pruningConditionDeterministic(self):
        phrase1 = Phrase(
            'dsf',
            test_time - 3 * stream_settings[
                'max_phrase_inactivity_time_in_seconds'],
            1)
        phrase2 = Phrase('dsf', test_time, 1)
        self.assertTrue(
            UtilityMethods.pruningConditionDeterministic(
                phrase1, test_time, **stream_settings))
        self.assertFalse(
            UtilityMethods.pruningConditionDeterministic(
                phrase2, test_time, **stream_settings))

    def test_pruningConditionRandom(self):
        phrase1 = Phrase(
            'dsf',
            test_time - 3 * stream_settings[
                'max_phrase_inactivity_time_in_seconds'],
            1)
        phrase2 = Phrase('dsf', test_time, 1)
        self.assertTrue(
            UtilityMethods.pruningConditionRandom(
                phrase1, test_time, **stream_settings))
        self.assertFalse(
            UtilityMethods.pruningConditionRandom(
                phrase2, test_time, **stream_settings))

    def test_pruneUnnecessaryPhrases(self):
        phraseTextToPhraseObjectMap = {
            'dsf': Phrase(
                'dsf',
                test_time - 3 * stream_settings[
                    'max_phrase_inactivity_time_in_seconds'],
                1),
            'abc': Phrase('abc', test_time, 1)
        }
        UtilityMethods.pruneUnnecessaryPhrases(
            phraseTextToPhraseObjectMap, test_time,
            UtilityMethods.pruningConditionRandom, **stream_settings)
        self.assertNotIn('dsf', phraseTextToPhraseObjectMap)
        self.assertIn('abc', phraseTextToPhraseObjectMap)
class UtilityMethodsTests(unittest.TestCase):
    """Exercises ``UtilityMethods`` phrase bookkeeping, dimension updates and
    pruning against shared ``stream_settings``."""

    def setUp(self):
        """Build the fixtures; save and then lower ``stream_settings['dimensions']``."""
        self.phraseVector = {
            'project': 1,
            'cluster': 1,
            'highdimensional': 1,
            'streams': 1
        }
        self.phraseTextAndDimensionMap = TwoWayMap()
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD,
                                           'project', 0)
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD,
                                           'cluster', 1)
        self.phraseTextToPhraseObjectMap = {
            'project': Phrase('project', test_time, score=8),
            'cluster': Phrase('cluster', test_time, score=8),
            # Stamped three inactivity periods before test_time.
            'abcd': Phrase('abcd', self._inactiveTime(), score=8)
        }
        self.vector = Vector({0: 1, 1: 1, 2: 1, 3: 1})
        self.initial_max_dimensions = stream_settings['dimensions']
        stream_settings['dimensions'] = 2

    def tearDown(self):
        """Put back the ``dimensions`` value that ``setUp`` replaced."""
        stream_settings['dimensions'] = self.initial_max_dimensions

    def _inactiveTime(self):
        """Return ``test_time`` minus three maximum inactivity periods."""
        return test_time - 3 * stream_settings[
            'max_phrase_inactivity_time_in_seconds']

    def _mappedPhrases(self):
        """Return the forward (phrase -> dimension) view of the map."""
        return self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD)

    def _registerPhrases(self, phrases, firstScore):
        """Add phrases stamped ``test_time`` with consecutive scores."""
        for offset, phrase in enumerate(phrases):
            self.phraseTextToPhraseObjectMap[phrase] = Phrase(
                phrase, test_time, score=firstScore + offset)

    def _runUpdateDimensions(self):
        """Invoke ``UtilityMethods.updateDimensions`` on the fixtures."""
        UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap,
                                        self.phraseTextToPhraseObjectMap,
                                        test_time, **stream_settings)

    def test_updatedPhraseObject_PhraseObjectScoresAreUpdatedCorrectly(self):
        UtilityMethods.updatePhraseTextToPhraseObject(
            self.phraseVector, test_time + timedelta(seconds=60),
            self.phraseTextToPhraseObjectMap, **stream_settings)
        phraseObjects = self.phraseTextToPhraseObjectMap
        self.assertEqual(5, len(phraseObjects))
        self.assertEqual(5, phraseObjects['project'].score)
        self.assertEqual(1, phraseObjects['streams'].score)

    def test_updatedPhraseObject_phrase_does_not_exist_in_phraseToIdMap_but_exists_in_phraseTextToPhraseObjectMap_with_dimensions_full(
            self):
        stream_settings['dimensions'] = 1
        self.phraseTextAndDimensionMap.remove(TwoWayMap.MAP_FORWARD, 'cluster')
        UtilityMethods.updatePhraseTextToPhraseObject(
            self.phraseVector, test_time + timedelta(seconds=60),
            self.phraseTextToPhraseObjectMap, **stream_settings)
        self.assertEqual({'project': 0}, self._mappedPhrases())
        phraseObjects = self.phraseTextToPhraseObjectMap
        self.assertEqual(5, len(phraseObjects))
        self.assertEqual(5, phraseObjects['project'].score)
        self.assertEqual(5, phraseObjects['cluster'].score)
        self.assertEqual(1, phraseObjects['streams'].score)

    def test_createOrAddNewPhraseObject(self):
        for phraseText in ('new_phrase', 'project'):
            UtilityMethods.createOrAddNewPhraseObject(
                phraseText, self.phraseTextToPhraseObjectMap, test_time,
                **stream_settings)
        phraseObjects = self.phraseTextToPhraseObjectMap
        self.assertEqual(4, len(phraseObjects))
        self.assertEqual(1, phraseObjects['new_phrase'].score)
        self.assertEqual(9, phraseObjects['project'].score)

    def test_updateDimensions_when_phraseTextToIdMap_is_filled_to_max_dimensions(
            self):
        self._registerPhrases(['added'], 10)
        self._runUpdateDimensions()
        self.assertEqual({'project': 0, 'added': 1}, self._mappedPhrases())

    def test_updateDimensions_when_phraseTextToIdMap_is_filled_to_max_dimensions_and_entire_map_is_changed(
            self):
        self._registerPhrases(['added', 'are'], 10)
        self._runUpdateDimensions()
        self.assertEqual({'added': 1, 'are': 0}, self._mappedPhrases())

    def test_updateDimensions_when_phraseTextToIdMap_has_lesser_than_max_dimensions(
            self):
        stream_settings['dimensions'] = 4
        self._registerPhrases(['new', 'phrases', 'are', 'added'], 7)
        self._runUpdateDimensions()
        # The assertion compares phrase names only; assigned ids are ignored.
        expectedPhrases = set(['project', 'phrases', 'are', 'added'])
        self.assertEqual(expectedPhrases, set(self._mappedPhrases()))
        self.assertEqual(4, len(self.phraseTextAndDimensionMap))

    def test_updateDimensions_when_phrases_with_lower_id_are_removed_from_phraseTextToIdMap(
            self):
        stream_settings['dimensions'] = 3
        self._registerPhrases(['new', 'phrases', 'are'], 100)
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'cluster', 2)
        self.phraseTextToPhraseObjectMap['cluster'].score = 100
        self._runUpdateDimensions()
        # Compare against a real list so the check does not rely on range()
        # returning one.
        self.assertEqual([0, 1, 2], sorted(self._mappedPhrases().values()))

    def test_updateDimensions_remove_old_phrases(self):
        phraseObjects = self.phraseTextToPhraseObjectMap
        originalTime = phraseObjects['abcd'].latestOccuranceTime
        phraseObjects['abcd'].latestOccuranceTime = test_time
        self._runUpdateDimensions()
        self.assertIn('abcd', phraseObjects)
        phraseObjects['abcd'].latestOccuranceTime = originalTime
        self._runUpdateDimensions()
        self.assertNotIn('abcd', phraseObjects)

    def test_updateDimensions_when_dimensions_have_to_be_removed(self):
        stream_settings['dimensions'] = 4
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'abcdx', 2)
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'abcdxy', 3)
        self._registerPhrases(['new_text'], 7)
        self.phraseTextToPhraseObjectMap[
            'cluster'].latestOccuranceTime = self._inactiveTime()
        self._runUpdateDimensions()
        # The assertion compares phrase names only; assigned ids are ignored.
        self.assertEqual(set(['project', 'new_text']),
                         set(self._mappedPhrases()))

    def test_checkCriticalErrorsInPhraseTextToIdMap_larger_than_expected_dimensions(
            self):
        import sys
        self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'sdfsd', 3)
        # Same text as before, emitted without the Python-2-only print
        # statement and without a trailing newline.
        sys.stdout.write('Ignore this message: ')
        self.assertRaises(
            SystemExit,
            UtilityMethods.checkCriticalErrorsInPhraseTextToIdMap,
            self.phraseTextAndDimensionMap, **stream_settings)

    def test_pruningConditionDeterministic(self):
        phrase1 = Phrase('dsf', self._inactiveTime(), 1)
        phrase2 = Phrase('dsf', test_time, 1)
        self.assertTrue(
            UtilityMethods.pruningConditionDeterministic(
                phrase1, test_time, **stream_settings))
        self.assertFalse(
            UtilityMethods.pruningConditionDeterministic(
                phrase2, test_time, **stream_settings))

    def test_pruningConditionRandom(self):
        phrase1 = Phrase('dsf', self._inactiveTime(), 1)
        phrase2 = Phrase('dsf', test_time, 1)
        self.assertTrue(
            UtilityMethods.pruningConditionRandom(phrase1, test_time,
                                                  **stream_settings))
        self.assertFalse(
            UtilityMethods.pruningConditionRandom(phrase2, test_time,
                                                  **stream_settings))

    def test_pruneUnnecessaryPhrases(self):
        phraseTextToPhraseObjectMap = {
            'dsf': Phrase('dsf', self._inactiveTime(), 1),
            'abc': Phrase('abc', test_time, 1)
        }
        UtilityMethods.pruneUnnecessaryPhrases(
            phraseTextToPhraseObjectMap, test_time,
            UtilityMethods.pruningConditionRandom, **stream_settings)
        self.assertNotIn('dsf', phraseTextToPhraseObjectMap)
        self.assertIn('abc', phraseTextToPhraseObjectMap)
class SignaturePermutationTests(unittest.TestCase):
    """Covers ``SignaturePermutationWithTrie``: adding, nearest lookup,
    removal and reset."""

    def setUp(self):
        """Sign two random documents and add both to ``self.pm``."""
        self.dimension, self.signatureLength = 50, 23
        self.phraseTextAndDimensionMap = TwoWayMap()
        for i in range(self.dimension):
            self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, i, i)
        self.unitRandomVectors = [
            self._newUnitVector() for _ in range(self.signatureLength)
        ]
        self.doc1 = Document(1, self._newUnitVector())
        self.doc2 = Document(2, self._newUnitVector())
        for document in (self.doc1, self.doc2):
            document.setSignatureUsingVectors(self.unitRandomVectors,
                                              self.phraseTextAndDimensionMap)
        self.pm = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        for document in (self.doc1, self.doc2):
            self.pm.addDocument(document)

    def _newUnitVector(self):
        """Return a new random Gaussian unit vector with ``self.dimension`` dimensions."""
        return VectorGenerator.getRandomGaussianUnitVector(
            dimension=self.dimension, mu=0, sigma=1)

    def _documentWithSignatureOfDoc1(self, docId):
        """Return a new document whose signature is a copy of doc1's."""
        document = Document(docId, self._newUnitVector())
        document.signature = Signature(self.doc1.signature.to01())
        return document

    def _doc1Entry(self):
        """Return the trie entry stored under doc1's permuted signature."""
        return self.pm.signatureTrie[self.doc1.signature.permutate(
            self.pm).to01()]

    def test_addDocument_newKey(self):
        doc1 = Document(1, self._newUnitVector())
        doc1.setSignatureUsingVectors(self.unitRandomVectors,
                                      self.phraseTextAndDimensionMap)
        pm = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        pm.addDocument(doc1)
        self.assertEqual(pm.signatureTrie[doc1.signature.permutate(pm).to01()],
                         set([1]))

    def test_addDocument_existingKey(self):
        newDocModifiedWithExistingSignature = (
            self._documentWithSignatureOfDoc1(3))
        self.pm.addDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(self._doc1Entry(), set([1, 3]))

    def test_getNearestDocument_usingAKeyAlreadyInTrie(self):
        self.assertEqual(self.pm.getNearestDocuments(self.doc1), set([1]))

    def test_getNearestDocument_usingANearbyKeyInTrie(self):
        digitReplacement = {'0': '1', '1': '0'}
        newDocWithANearbySignature = Document(3, self._newUnitVector())
        exactSignature = self.doc1.signature.to01()
        # Flip only the final bit of doc1's signature.
        newDocWithANearbySignature.signature = Signature(
            exactSignature[:-1] + digitReplacement[exactSignature[-1]])
        self.assertNotEqual(self.doc1.signature.to01(),
                            newDocWithANearbySignature.signature.to01())
        # This assertion can sometimes fail because of randomization.
        # Run the tests again. It's OK!
        self.assertEqual(
            self.pm.getNearestDocuments(newDocWithANearbySignature),
            set([1]))

    def test_getNearestDocument_emptyTrie(self):
        permutationWithEmptyTrie = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        self.assertEqual(
            permutationWithEmptyTrie.getNearestDocuments(self.doc1), set())

    def test_removeDocument_documents(self):
        newDocModifiedWithExistingSignature = (
            self._documentWithSignatureOfDoc1(3))
        self.pm.addDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(self._doc1Entry(), set([1, 3]))
        self.pm.removeDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(self._doc1Entry(), set([1]))
        self.pm.removeDocument(self.doc1)
        # Once its last document is gone the key must not resolve any more.
        self.assertIsNone(
            self.pm.signatureTrie.get(
                self.doc1.signature.permutate(self.pm).to01()))

    def test_resetSignatureTrie(self):
        self.assertGreater(len(self.pm.signatureTrie), 0)
        self.pm.resetSignatureDataStructure()
        self.assertEqual(0, len(self.pm.signatureTrie))