예제 #1
0
    def testSerializeString(self):
        """Round-trip an encoder through writeToString()/loadFromString().

        A second encoder is created with different size/sparsity, then
        overwritten from the serialized form of the first. Afterwards it must
        report the same parameters and produce the same encoding of the test
        document as the first encoder.
        """
        vocab = {
            "hear": 2, "nothing": 4, "but": 1, "a": 1, "rushing": 4,
            "sound": 3}
        document = [
            "hear", "any", "sound", "sound", "louder", "but", "walls"]

        params = SimHashDocumentEncoderParameters()
        params.size = 400
        params.sparsity = 0.33
        params.encodeOrphans = True
        params.vocabulary = vocab

        enc1 = SimHashDocumentEncoder(params)
        serialized = enc1.writeToString()
        output1 = enc1.encode(document)

        # Build the second encoder with deliberately different parameters so
        # the equality checks further down can only pass if loading actually
        # replaced them.
        params.size = 40
        params.sparsity = 0.1
        enc2 = SimHashDocumentEncoder(params)

        assert enc1.size != enc2.size
        assert enc1.parameters.size != enc2.parameters.size
        assert enc1.parameters.activeBits != enc2.parameters.activeBits

        enc2.loadFromString(serialized)
        # Encode with the restored encoder (enc2, not enc1); otherwise the
        # output comparison below would compare enc1 against itself.
        output2 = enc2.encode(document)

        assert enc1.size == enc2.size
        assert enc1.parameters.size == enc2.parameters.size
        assert enc1.parameters.activeBits == enc2.parameters.activeBits
        assert output1 == output2
예제 #2
0
    def testSerializeToFile(self):
        """Round-trip an encoder through saveToFile()/loadFromFile() as JSON.

        A second encoder is created with different size/sparsity, then loaded
        from the file written by the first. Afterwards it must report the same
        parameters and produce the same encoding of the test document.
        The temporary file is removed even if a step in between fails.
        """
        vocab = {
            "hear": 2, "nothing": 4, "but": 1, "a": 1, "rushing": 4,
            "sound": 3}
        document = [
            "hear", "any", "sound", "sound", "louder", "but", "walls"]

        params = SimHashDocumentEncoderParameters()
        params.size = 400
        params.sparsity = 0.33
        params.encodeOrphans = True
        params.vocabulary = vocab

        enc1 = SimHashDocumentEncoder(params)
        # Named 'path' rather than 'file' to avoid shadowing a builtin name.
        path = "SimHashDocumentEncoder_test_save2.json"
        enc1.saveToFile(path, "JSON")
        try:
            output1 = enc1.encode(document)

            # Change the parameters so we know they were replaced by the
            # contents of the file.
            # Note: we should have a constructor without parameters for this
            # situation.
            params.size = 10
            params.sparsity = 0.5
            enc2 = SimHashDocumentEncoder(params)
            enc2.loadFromFile(path, "JSON")
        finally:
            # Always clean up the file written above, even on failure.
            os.remove(path)

        output2 = enc2.encode(document)
        assert enc1.size == enc2.size
        assert enc1.parameters.size == enc2.parameters.size
        assert enc1.parameters.activeBits == enc2.parameters.activeBits
        # The restored encoder must also encode identically, as in the other
        # serialization tests.
        assert output1 == output2
예제 #3
0
    def testConstructor(self):
        """Check constructor behaviour for valid and invalid parameter sets.

        Cases covered:
          * 'activeBits' only            -> encoder is built, values echoed.
          * 'activeBits' and 'sparsity'  -> RuntimeError.
          * neither of the two           -> RuntimeError.
          * 'sparsity' only              -> encoder is built; 0.05 of size
                                            400 is reported as 20 activeBits.
          * frequencyCeiling < frequencyFloor -> RuntimeError.

        For every failing case the target variable is checked *after* the
        attempted construction to confirm it was never assigned.
        """
        params1 = SimHashDocumentEncoderParameters()
        params1.size = 400
        params1.activeBits = 20
        encoder1 = SimHashDocumentEncoder(params1)
        assert encoder1
        assert encoder1.dimensions == [params1.size]
        assert encoder1.size == params1.size
        assert encoder1.parameters.size == params1.size
        assert encoder1.parameters.activeBits == params1.activeBits
        assert not encoder1.parameters.tokenSimilarity

        # test bad encoder params - both activeBits and sparsity
        params2 = SimHashDocumentEncoderParameters()
        params2.size = 400
        params2.activeBits = 20
        params2.sparsity = 0.666
        encoder2 = None
        with self.assertRaises(RuntimeError):
            encoder2 = SimHashDocumentEncoder(params2)
        assert not encoder2

        # test bad encoder params - neither activeBits or sparsity
        params3 = SimHashDocumentEncoderParameters()
        params3.size = 400
        encoder3 = None
        with self.assertRaises(RuntimeError):
            encoder3 = SimHashDocumentEncoder(params3)
        assert not encoder3

        # test good encoder param - using 'sparsity' instead of 'activeBits'
        params4 = SimHashDocumentEncoderParameters()
        params4.size = 400
        params4.sparsity = 0.05
        encoder4 = SimHashDocumentEncoder(params4)
        assert encoder4
        assert encoder4.dimensions == [params4.size]
        assert encoder4.size == params4.size
        assert encoder4.parameters.size == params4.size
        assert encoder4.parameters.activeBits == 20
        assert not encoder4.parameters.tokenSimilarity

        # test bad encoder params - frequency should be ceiling > floor
        params5 = SimHashDocumentEncoderParameters()
        params5.size = 400
        params5.sparsity = 0.05
        params5.frequencyCeiling = 3
        params5.frequencyFloor = 6
        encoder5 = None
        with self.assertRaises(RuntimeError):
            encoder5 = SimHashDocumentEncoder(params5)
        assert not encoder5
예제 #4
0
    def testUnicode(self):
        """Encode documents made of non-ASCII tokens in both similarity modes.

        The second document is the first with U+0410 appended to every token.
        With 'tokenSimilarity' on, the overlap of the two encodings must be
        greater than 65; with it off, the overlap must be less than 65.
        """
        baseDoc = [
          "\u0395\u0396\u0397\u0398\u0399",
          "\u0400\u0401\u0402\u0403\u0404",
          "\u0405\u0406\u0407\u0408\u0409"]
        extendedDoc = [
          "\u0395\u0396\u0397\u0398\u0399\u0410",
          "\u0400\u0401\u0402\u0403\u0404\u0410",
          "\u0405\u0406\u0407\u0408\u0409\u0410"]

        config = SimHashDocumentEncoderParameters()
        config.size = 400
        config.sparsity = 0.33

        # Pass 1: unicode with 'tokenSimilarity' enabled.
        config.tokenSimilarity = True
        similarEncoder = SimHashDocumentEncoder(config)
        baseSdr = SDR(config.size)
        extendedSdr = SDR(config.size)
        similarEncoder.encode(baseDoc, baseSdr)
        similarEncoder.encode(extendedDoc, extendedSdr)
        overlapWithSimilarity = baseSdr.getOverlap(extendedSdr)
        assert overlapWithSimilarity > 65

        # Pass 2: unicode with 'tokenSimilarity' disabled, reusing the SDRs
        # after clearing them.
        config.tokenSimilarity = False
        plainEncoder = SimHashDocumentEncoder(config)
        baseSdr.zero()
        extendedSdr.zero()
        plainEncoder.encode(baseDoc, baseSdr)
        plainEncoder.encode(extendedDoc, extendedSdr)
        overlapWithoutSimilarity = baseSdr.getOverlap(extendedSdr)
        assert overlapWithoutSimilarity < 65
예제 #5
0
    def testTokenVocabulary(self):
        """Check how a vocabulary interacts with the 'encodeOrphans' flag.

        One input holds only tokens that are keys of the vocabulary; the
        other adds tokens that are not. With 'encodeOrphans' on the two
        encodings must differ; with it off they must be equal.
        """
        knownTokens = "a b c d e f"
        knownPlusUnknown = "a b c d e f t u w x y z"
        weights = {
          "a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6,
          "g": 1, "h": 2, "i": 3, "j": 4, "k": 5, "l": 6}

        config = SimHashDocumentEncoderParameters()
        config.size = 400
        config.sparsity = 0.33
        config.vocabulary = weights

        # vocabulary +encodeOrphans
        config.encodeOrphans = True
        orphanEncoder = SimHashDocumentEncoder(config)
        withOrphansKnown = orphanEncoder.encode(knownTokens)
        withOrphansMixed = orphanEncoder.encode(knownPlusUnknown)
        assert withOrphansKnown != withOrphansMixed

        # vocabulary -encodeOrphans
        config.encodeOrphans = False
        strictEncoder = SimHashDocumentEncoder(config)
        strictKnown = strictEncoder.encode(knownTokens)
        strictMixed = strictEncoder.encode(knownPlusUnknown)
        assert strictKnown == strictMixed
예제 #6
0
    def testTokenSimilarity(self):
        """Compare overlap orderings with 'tokenSimilarity' on and off.

        The four documents testDoc1..testDoc4 (defined outside this method)
        are encoded in both modes, and the pairwise overlaps must fall in
        a specific strict order for each mode.
        """
        config = SimHashDocumentEncoderParameters()
        config.size = 400
        config.sparsity = 0.33
        config.caseSensitivity = True

        documents = (testDoc1, testDoc2, testDoc3, testDoc4)

        # tokenSimilarity ON
        config.tokenSimilarity = True
        similarEncoder = SimHashDocumentEncoder(config)
        sdrs = [SDR(config.size) for _ in documents]
        for doc, sdr in zip(documents, sdrs):
            similarEncoder.encode(doc, sdr)
        sdr1, sdr2, sdr3, sdr4 = sdrs
        # Expected order: (3,4) > (2,3) > (1,3) > (1,4)
        assert (sdr3.getOverlap(sdr4)
                > sdr2.getOverlap(sdr3)
                > sdr1.getOverlap(sdr3)
                > sdr1.getOverlap(sdr4))

        # tokenSimilarity OFF
        config.tokenSimilarity = False
        plainEncoder = SimHashDocumentEncoder(config)
        for sdr in sdrs:
            sdr.zero()
        for doc, sdr in zip(documents, sdrs):
            plainEncoder.encode(doc, sdr)
        # Expected order: (1,2) > (2,3) > (3,4) > (1,3)
        assert (sdr1.getOverlap(sdr2)
                > sdr2.getOverlap(sdr3)
                > sdr3.getOverlap(sdr4)
                > sdr1.getOverlap(sdr3))
예제 #7
0
    def testTokenCaseSensitivity(self):
        """Check 'caseSensitivity' alone and together with excludes/vocabulary.

        Expectations:
          * sensitivity on:  lower-case and upper-case documents encode
            differently.
          * sensitivity off: they encode identically.
          * sensitivity off with a mixed-case exclude list: the full document
            encodes like the mixed-case remainder.
          * sensitivity off with a mixed-case vocabulary and no orphans: the
            full document encodes like the mixed-case remainder.
        """
        # Case-sensitive strings
        lowerDoc = [
            "alpha", "bravo",  "delta",  "echo",  "foxtrot", "hotel"]
        upperDoc = [
            "ALPHA", "BRAVO",  "DELTA",  "ECHO",  "FOXTROT", "HOTEL"]
        keptMixed = ["eCHo", "foXTROt", "hOtEl"]
        droppedMixed = ["AlPHa", "BRaVo", "dELTa"]
        mixedVocab = {"EcHo": 1, "FOxtRoT": 1, "HoTeL": 1}

        config = SimHashDocumentEncoderParameters()
        config.size = 400
        config.sparsity = 0.33

        # caseSensitivity ON
        config.caseSensitivity = True
        sensitiveEncoder = SimHashDocumentEncoder(config)
        lowerSdr = sensitiveEncoder.encode(lowerDoc)
        upperSdr = sensitiveEncoder.encode(upperDoc)
        assert lowerSdr != upperSdr

        # caseSensitivity OFF
        config.caseSensitivity = False
        insensitiveEncoder = SimHashDocumentEncoder(config)
        lowerSdr = insensitiveEncoder.encode(lowerDoc)
        upperSdr = insensitiveEncoder.encode(upperDoc)
        assert lowerSdr == upperSdr

        # caseSensitivity=OFF +excludes
        config.excludes = droppedMixed
        excludingEncoder = SimHashDocumentEncoder(config)
        fullMinusExcluded = excludingEncoder.encode(lowerDoc)
        remainderOnly = excludingEncoder.encode(keptMixed)
        assert fullMinusExcluded == remainderOnly

        # caseSensitivity=OFF +vocabulary (fresh parameters, no excludes)
        vocabConfig = SimHashDocumentEncoderParameters()
        vocabConfig.size = 400
        vocabConfig.sparsity = 0.33
        vocabConfig.caseSensitivity = False
        vocabConfig.encodeOrphans = False
        vocabConfig.vocabulary = mixedVocab
        vocabEncoder = SimHashDocumentEncoder(vocabConfig)
        fullViaVocab = vocabEncoder.encode(lowerDoc)
        remainderViaVocab = vocabEncoder.encode(keptMixed)
        assert fullViaVocab == remainderViaVocab
예제 #8
0
 def testDeterminism(self):
     """Compare the encoding of a fixed sentence against a stored result.

     The expected SDR has size 1000 and the sparse indices listed below;
     the encoder is configured with that size and a sparsity of 0.05.
     """
     expectedIndices = [
         2, 34, 37, 38, 69, 79, 114, 170, 200, 234, 254, 258, 279, 289, 291,
         292, 295, 307, 321, 336, 345, 350, 361, 373, 378, 400, 450, 461,
         462, 487, 520, 532, 539, 548, 576, 583, 616, 623, 626, 627, 663,
         681, 695, 716, 794, 799, 830, 835, 837, 841]
     expected = SDR(1000)
     expected.sparse = expectedIndices

     config = SimHashDocumentEncoderParameters()
     config.size = expected.size
     config.sparsity = 0.05

     sentence = "I came to the fork in the road"
     actual = SimHashDocumentEncoder(config).encode(sentence)
     assert actual == expected
예제 #9
0
    def testFrequency(self):
        """Check that frequency floor/ceiling settings change the encoding.

        Token part: the same document is encoded with default parameters,
        with frequencyFloor=1, and with frequencyFloor=0/frequencyCeiling=4;
        all three encodings must differ from one another.

        Character part: with 'tokenSimilarity' on, setting
        frequencyCeiling=3 must change the encoding of the same document.
        """
        wordDoc = "a a a b b c d d d d e e f"  # min 1 max 4
        charDoc = "abbbbbbcccdefg aaaaaabccchijk aaabcccccclmno"

        # --- token frequency floor/ceiling ---
        tokenConfig = SimHashDocumentEncoderParameters()
        tokenConfig.size = 400
        tokenConfig.sparsity = 0.33
        defaultSdr = SimHashDocumentEncoder(tokenConfig).encode(wordDoc)

        tokenConfig.frequencyFloor = 1
        floorSdr = SimHashDocumentEncoder(tokenConfig).encode(wordDoc)

        tokenConfig.frequencyFloor = 0
        tokenConfig.frequencyCeiling = 4
        ceilingSdr = SimHashDocumentEncoder(tokenConfig).encode(wordDoc)

        assert defaultSdr != floorSdr
        assert defaultSdr != ceilingSdr
        assert floorSdr != ceilingSdr

        # --- character frequency ceiling (only) ---
        charConfig = SimHashDocumentEncoderParameters()
        charConfig.size = 400
        charConfig.sparsity = 0.33
        charConfig.tokenSimilarity = True
        uncappedSdr = SimHashDocumentEncoder(charConfig).encode(charDoc)

        charConfig.frequencyCeiling = 3
        cappedSdr = SimHashDocumentEncoder(charConfig).encode(charDoc)

        assert uncappedSdr != cappedSdr
예제 #10
0
    def testBasicExampleUseCase(self):
        """Minimal usage example: similar sentences should overlap more.

        Three sentences are encoded with one encoder. The first two differ
        by a single word, the third shares only the last word; the overlap
        of encodings 1 and 2 must exceed the overlap of encodings 2 and 3.
        """
        skySentence = "The sky is beautiful today"
        sunSentence = "The sun is beautiful today"  # similar up, differ down
        homeworkSentence = "Who did my homework  today"

        # Configure and build the encoder.
        config = SimHashDocumentEncoderParameters()
        config.size = 400
        config.sparsity = 0.33
        docEncoder = SimHashDocumentEncoder(config)

        # Encode each sentence in turn.
        skySdr = docEncoder.encode(skySentence)
        sunSdr = docEncoder.encode(sunSentence)
        homeworkSdr = docEncoder.encode(homeworkSentence)

        # The near-identical pair must overlap more than the disparate pair.
        similarOverlap = skySdr.getOverlap(sunSdr)
        disparateOverlap = sunSdr.getOverlap(homeworkSdr)
        assert similarOverlap > disparateOverlap
예제 #11
0
    def testTokenWeightMap(self):
        """Check that vocabulary weights influence which documents overlap.

        Three documents are encoded with a weighted vocabulary (orphans and
        token similarity off). The expected strict ordering of overlaps is
        (doc1, doc3) > (doc1, doc2) > (doc2, doc3).
        """
        tokenWeights = {
          "aaa": 4, "bbb": 2, "ccc": 2, "ddd": 4, "eee": 2, "fff": 2, "sss": 1}
        firstDoc = ["aaa", "bbb", "ccc", "ddd", "sss"]
        secondDoc = ["eee", "bbb", "ccc", "fff", "sss"]
        thirdDoc = ["aaa", "eee", "fff", "ddd"]

        config = SimHashDocumentEncoderParameters()
        config.size = 400
        config.sparsity = 0.33
        config.tokenSimilarity = False
        config.encodeOrphans = False
        config.vocabulary = tokenWeights
        weightedEncoder = SimHashDocumentEncoder(config)

        firstSdr = weightedEncoder.encode(firstDoc)
        secondSdr = weightedEncoder.encode(secondDoc)
        thirdSdr = weightedEncoder.encode(thirdDoc)

        assert (firstSdr.getOverlap(thirdSdr)
                > firstSdr.getOverlap(secondSdr)
                > secondSdr.getOverlap(thirdSdr))
        assert(output1.getOverlap(output2) > output2.getOverlap(output3))
예제 #12
0
    def testExcludes(self):
        """Check that the 'excludes' list removes tokens before encoding.

        Encoding the full token list with the unwanted tokens excluded must
        equal encoding only the wanted tokens, and both must differ from
        encoding the full list without any excludes.
        """
        wanted = ["but", "it", "all", "stays", "the", "same"]
        unwanted = ["seasons", "change", "mad", "things", "rearrange"]
        everything = wanted + unwanted

        config = SimHashDocumentEncoderParameters()
        config.size = 400
        config.sparsity = 0.33

        # Two encoders without excludes: one sees everything, one only the
        # wanted tokens.
        fullEncoder = SimHashDocumentEncoder(config)
        fullSdr = fullEncoder.encode(everything)

        partEncoder = SimHashDocumentEncoder(config)
        partSdr = partEncoder.encode(wanted)

        # Third encoder sees everything but is told to exclude the unwanted.
        config.excludes = unwanted
        filteringEncoder = SimHashDocumentEncoder(config)
        filteredSdr = filteringEncoder.encode(everything)

        assert fullSdr != partSdr      # full != part
        assert fullSdr != filteredSdr  # full != (full - nope)
        assert partSdr == filteredSdr  # part == (full - nope)
예제 #13
0
    def testSerializePickle(self):
        """Round-trip an encoder through pickle.dumps()/pickle.loads().

        The unpickled encoder must report the same size and parameters as
        the original and produce the same encoding of the test document.
        """
        document = [
            "hear", "any", "sound", "sound", "louder", "but", "walls"]
        vocab = {
            "hear": 2, "nothing": 4, "but": 1, "a": 1, "rushing": 4,
            "sound": 3}

        config = SimHashDocumentEncoderParameters()
        config.size = 400
        config.sparsity = 0.33
        config.encodeOrphans = True
        config.vocabulary = vocab

        original = SimHashDocumentEncoder(config)
        # Pickle before encoding, then encode with the original.
        blob = pickle.dumps(original)
        originalSdr = original.encode(document)

        restored = pickle.loads(blob)
        restoredSdr = restored.encode(document)

        assert original.size == restored.size
        assert original.parameters.size == restored.parameters.size
        assert (original.parameters.activeBits
                == restored.parameters.activeBits)
        assert originalSdr == restoredSdr
예제 #14
0
    def testStatistics(self):
        """Run mass-encoding statistics with 'tokenSimilarity' off and on.

        Random documents drawn from a fixed 100-word corpus are encoded by
        two encoders (one per mode). The resulting SDRs are fed into one
        Metrics object per encoder, and the activation frequency, overlap
        and sparsity statistics are checked against fixed bounds.
        """
        # 100 random simple English words run mass encoding stats against
        testCorpus = [
            "find", "any", "new", "work", "part", "take", "get", "place",
            "made", "live", "where", "after", "back", "little", "only",
            "round", "man", "year", "came", "show", "every", "good", "me",
            "give", "our", "under", "name", "very", "through", "just", "form",
            "sentence", "great", "think", "say", "help", "low", "line",
            "differ", "turn", "cause", "much", "mean", "before", "move",
            "right", "boy", "old", "too", "same", "tell", "does", "set",
            "three", "want", "air", "well", "also", "play", "small", "end",
            "put", "home", "read", "hand", "port", "large", "spell", "add",
            "even", "land", "here", "must", "big", "high", "such", "follow",
            "act", "why", "ask", "men", "change", "went", "light", "kind",
            "off", "need", "house", "picture", "try", "us", "again", "animal",
            "point", "mother", "world", "near", "build", "self", "earth"]
        num_samples = 1000  # number of documents to run
        num_tokens = 10     # tokens per document

        # Case 1 = tokenSimilarity OFF
        params1 = SimHashDocumentEncoderParameters()
        params1.size = 400
        params1.sparsity = 0.33
        params1.tokenSimilarity = False
        encoder1 = SimHashDocumentEncoder(params1)

        # Case 2 = tokenSimilarity ON
        # Use an independent parameters object: assigning 'params2 = params1'
        # would only alias it, and flipping tokenSimilarity would then also
        # change the Case 1 parameters.
        params2 = SimHashDocumentEncoderParameters()
        params2.size = 400
        params2.sparsity = 0.33
        params2.tokenSimilarity = True
        encoder2 = SimHashDocumentEncoder(params2)

        sdrs1 = []
        sdrs2 = []
        for _ in range(num_samples):
            # NOTE(review): this draws num_tokens - 1 tokens per document,
            # not num_tokens. Left unchanged so the statistics checked below
            # are not perturbed - confirm whether the bound is intended.
            document = [
                random.choice(testCorpus) for _ in range(num_tokens - 1)]
            # Both encoders see the exact same document.
            sdrs1.append(encoder1.encode(document))
            sdrs2.append(encoder2.encode(document))

        report1 = Metrics([encoder1.size], len(sdrs1) + 1)
        report2 = Metrics([encoder2.size], len(sdrs2) + 1)

        for sdr in sdrs1:
            report1.addData(sdr)
        for sdr in sdrs2:
            report2.addData(sdr)

        # Assertions for Case 1 = tokenSimilarity OFF
        assert report1.activationFrequency.entropy() > 0.87
        assert report1.activationFrequency.min() > 0.01
        assert report1.activationFrequency.max() < 0.99
        assert report1.activationFrequency.mean() > params1.sparsity - 0.005
        assert report1.activationFrequency.mean() < params1.sparsity + 0.005
        assert report1.overlap.min() > 0.21
        assert report1.overlap.max() > 0.53
        assert report1.overlap.mean() > 0.38
        assert report1.sparsity.min() > params1.sparsity - 0.01
        assert report1.sparsity.max() < params1.sparsity + 0.01
        assert report1.sparsity.mean() > params1.sparsity - 0.005
        assert report1.sparsity.mean() < params1.sparsity + 0.005

        # Assertions for Case 2 = tokenSimilarity ON
        assert report2.activationFrequency.entropy() > 0.59
        assert report2.activationFrequency.min() >= 0
        assert report2.activationFrequency.max() <= 1
        assert report2.activationFrequency.mean() > params2.sparsity - 0.005
        assert report2.activationFrequency.mean() < params2.sparsity + 0.005
        assert report2.overlap.min() > 0.38
        assert report2.overlap.max() > 0.78
        assert report2.overlap.mean() > 0.61
        assert report2.sparsity.min() > params2.sparsity - 0.01
        assert report2.sparsity.max() < params2.sparsity + 0.01
        assert report2.sparsity.mean() > params2.sparsity - 0.005
        assert report2.sparsity.mean() < params2.sparsity + 0.005