Пример #1
0
 def setUp(self):
     """Create two signed documents and add them to a trie permutation.

     Fixture attributes set here: ``dimension``, ``signatureLength``,
     ``phraseTextAndDimensionMap``, ``unitRandomVectors``, ``doc1``,
     ``doc2`` and ``pm``.
     """
     self.dimension, self.signatureLength = 50, 23

     def randomUnitVector():
         # Single place that knows the generator arguments used below.
         return VectorGenerator.getRandomGaussianUnitVector(
             dimension=self.dimension, mu=0, sigma=1)

     # Forward-map every dimension index to itself.
     self.phraseTextAndDimensionMap = TwoWayMap()
     for dimensionId in range(self.dimension):
         self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD,
                                            dimensionId, dimensionId)

     # One vector per signature position; generated before the document
     # vectors so the sequence of generator calls stays the same.
     self.unitRandomVectors = [
         randomUnitVector() for _ in range(self.signatureLength)
     ]

     self.doc1 = Document(1, randomUnitVector())
     self.doc2 = Document(2, randomUnitVector())
     documents = (self.doc1, self.doc2)
     for document in documents:
         document.setSignatureUsingVectors(self.unitRandomVectors,
                                           self.phraseTextAndDimensionMap)

     self.pm = SignaturePermutationWithTrie(
         signatureLength=self.signatureLength)
     for document in documents:
         self.pm.addDocument(document)
Пример #2
0
 def setUp(self):
     """Prepare phrase fixtures and force ``stream_settings['dimensions']`` to 2.

     The previous value of the setting is kept in
     ``self.initial_max_dimensions`` (presumably so a tearDown outside this
     view can restore it -- confirm against the rest of the test case).
     """
     self.phraseVector = dict.fromkeys(
         ('project', 'cluster', 'highdimensional', 'streams'), 1)

     # Only the first two phrases get a dimension: 'project' -> 0,
     # 'cluster' -> 1.
     self.phraseTextAndDimensionMap = TwoWayMap()
     for dimensionId, phraseText in enumerate(('project', 'cluster')):
         self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD,
                                            phraseText, dimensionId)

     phraseObjects = {}
     for phraseText in ('project', 'cluster'):
         phraseObjects[phraseText] = Phrase(phraseText, test_time, score=8)
     # 'abcd' is stamped three max-inactivity periods before test_time.
     pastTime = (
         test_time -
         3 * stream_settings['max_phrase_inactivity_time_in_seconds'])
     phraseObjects['abcd'] = Phrase('abcd', pastTime, score=8)
     self.phraseTextToPhraseObjectMap = phraseObjects

     self.vector = Vector(dict.fromkeys(range(4), 1))

     # Remember the configured value before overriding the shared setting.
     self.initial_max_dimensions = stream_settings['dimensions']
     stream_settings['dimensions'] = 2
    def __init__(self, **settings):
        """Initialise the object from keyword settings.

        Keys read from ``settings``:
            nearest_neighbor_threshold: stored as ``nearestNeighborThreshold``.
            dimensions: passed to the unit vector and the permutations.
            signature_length: passed to the permutations and to every
                signature permutation.
            number_of_permutations: how many signature permutations to build.
            signature_type (optional): ``'signature_type_trie'`` (the
                default) selects ``SignaturePermutationWithTrie``; any other
                value selects ``SignaturePermutationWithSortedList``.
        """
        self.settings = settings
        self.nearestNeighborThreshold = settings['nearest_neighbor_threshold']
        self.unitVector = RandomGaussianUnitVector(
            dimensions=settings['dimensions'], mu=0, sigma=1)
        self.vectorPermutations = VectorPermutation.getPermutations(
            settings['signature_length'], settings['dimensions'],
            self.unitVector)

        # Choose the permutation class once instead of repeating the
        # comprehension in both branches.
        signatureType = settings.get('signature_type', 'signature_type_trie')
        if signatureType == 'signature_type_trie':
            permutationClass = SignaturePermutationWithTrie
        else:
            permutationClass = SignaturePermutationWithSortedList

        permutationCount = settings['number_of_permutations']
        self.signaturePermutations = [
            permutationClass(settings['signature_length'])
            for _ in range(permutationCount)
        ]

        self.phraseTextAndDimensionMap = TwoWayMap()
        self.documentIdToDocumentMap = {}
Пример #4
0
    def test_setSignatureUsingVectorPermutations(self):
        """Both signature-setting methods must yield equal signatures.

        The same document vector is signed once with explicitly permuted
        unit vectors and once with the unit vector plus its permutations.
        The comparison is made first with a map covering all 53 dimensions
        and then with a map covering only the first 3.
        """
        dimensions, signatureLength = 53, 13

        def identityMap(size):
            # Forward-map each index in range(size) to itself.
            mapping = TwoWayMap()
            for index in range(size):
                mapping.set(TwoWayMap.MAP_FORWARD, index, index)
            return mapping

        completeMap = identityMap(dimensions)
        mapWithMissingDimensions = identityMap(dimensions - 50)

        unitVector = RandomGaussianUnitVector(
            dimensions=dimensions, mu=0, sigma=1)
        vectorPermutations = VectorPermutation.getPermutations(
            signatureLength, dimensions, unitVector)
        permutedUnitVectors = [
            unitVector.getPermutedVector(permutation)
            for permutation in vectorPermutations
        ]

        # Both documents are built from the very same vector object.
        documentVector = VectorGenerator.getRandomGaussianUnitVector(
            dimension=dimensions, mu=0, sigma=1)
        signedByVectors = Document(1, documentVector)
        signedByPermutations = Document(2, documentVector)

        for dimensionMap in (completeMap, mapWithMissingDimensions):
            signedByVectors.setSignatureUsingVectors(
                permutedUnitVectors, dimensionMap)
            signedByPermutations.setSignatureUsingVectorPermutations(
                unitVector, vectorPermutations, dimensionMap)
            self.assertEqual(signedByVectors.signature,
                             signedByPermutations.signature)
Пример #5
0
 def setUp(self):
     """Build phrase fixtures and set ``settings['dimensions']`` to 2.

     Sets ``phraseVector``, ``phraseTextAndDimensionMap`` and
     ``finalPhraseToIdMap`` on the test instance. The write to ``settings``
     changes an object defined outside this method.
     """
     self.phraseVector = dict.fromkeys(
         ('project', 'cluster', 'highdimensional', 'streams'), 1)

     # Initially only 'project' -> 0 and 'cluster' -> 1 are registered.
     dimensionMap = TwoWayMap()
     for dimensionId, phraseText in enumerate(('project', 'cluster')):
         dimensionMap.set(TwoWayMap.MAP_FORWARD, phraseText, dimensionId)
     self.phraseTextAndDimensionMap = dimensionMap

     # Mapping with all four phrases: ids follow the listed order.
     orderedPhrases = ('project', 'cluster', 'streams', 'highdimensional')
     self.finalPhraseToIdMap = {
         phraseText: phraseId
         for phraseId, phraseText in enumerate(orderedPhrases)
     }

     settings['dimensions'] = 2
Пример #6
0
 def __init__(self, **clustering_settings):
     """Initialise the object from keyword clustering settings.

     Keys read from ``clustering_settings``:
         threshold_for_document_to_be_in_cluster: stored as
             ``thresholdForDocumentToBeInACluster``.
         dimensions: passed to the unit vector and the permutations.
         signature_length: passed to the permutations and to every
             signature permutation.
         number_of_permutations: how many signature permutations to build.
         signature_type (optional): ``'signature_type_trie'`` (the default)
             selects ``SignaturePermutationWithTrie``; any other value
             selects ``SignaturePermutationWithSortedList``.
     """
     self.thresholdForDocumentToBeInACluster = clustering_settings[
         'threshold_for_document_to_be_in_cluster']
     self.unitVector = RandomGaussianUnitVector(
         dimensions=clustering_settings['dimensions'], mu=0, sigma=1)
     self.vectorPermutations = VectorPermutation.getPermutations(
         clustering_settings['signature_length'],
         clustering_settings['dimensions'], self.unitVector)

     # Only the selected class name is evaluated, as in an if/else.
     useTrie = clustering_settings.get(
         'signature_type', 'signature_type_trie') == 'signature_type_trie'
     permutationClass = (SignaturePermutationWithTrie
                         if useTrie else SignaturePermutationWithSortedList)
     permutationCount = clustering_settings['number_of_permutations']
     self.signaturePermutations = [
         permutationClass(clustering_settings['signature_length'])
         for _ in range(permutationCount)
     ]

     self.phraseTextAndDimensionMap = TwoWayMap()
     self.clusters = {}
     self.clustering_settings = clustering_settings
Пример #7
0
 def test_setSignatureUsingVectors(self):
     """Check signatures for phrases inside and outside the dimension map.

     One document uses only mapped phrases ('a', 'b'); the other contains
     'c', which has no entry in the map. Expected signatures are '01' and
     '10' respectively.
     """
     dimensionMap = TwoWayMap()
     for phraseText, dimensionId in (('a', 1), ('b', 2)):
         dimensionMap.set(TwoWayMap.MAP_FORWARD, phraseText, dimensionId)

     # Both documents are created with id 1, as in the original fixture.
     mappedDocument = Document(1, {'a': 1, 'b': 4})
     unmappedDocument = Document(1, {'a': 1, 'c': 4})

     # NOTE(review): the expected bits match the sign of each vector's dot
     # product with the mapped part of the document -- confirm in Document.
     firstVector = Vector({1: 3 / 5., 2: -4 / 5.})
     secondVector = Vector({1: -5 / 13., 2: 12 / 13.})
     vectors = [firstVector, secondVector]

     for document in (mappedDocument, unmappedDocument):
         document.setSignatureUsingVectors(vectors, dimensionMap)

     self.assertEqual(Signature('01'), mappedDocument.signature)
     self.assertEqual(Signature('10'), unmappedDocument.signature)