def testSerializeToFile(self):
    """Round-trip an encoder through a JSON file and compare it to the source.

    An encoder is saved to disk, then loaded into a second encoder that was
    deliberately built with different parameters. The test passes when the
    loaded encoder reports the same size / activeBits as the original AND
    produces the same encoding for the same document.
    """
    vocab = {
        "hear": 2, "nothing": 4, "but": 1, "a": 1, "rushing": 4, "sound": 3}
    document = [
        "hear", "any", "sound", "sound", "louder", "but", "walls"]

    params = SimHashDocumentEncoderParameters()
    params.size = 400
    params.sparsity = 0.33
    params.encodeOrphans = True
    params.vocabulary = vocab
    enc1 = SimHashDocumentEncoder(params)

    # The SimHashDocumentEncoder now has some data in it, try serialization.
    path = "SimHashDocumentEncoder_test_save2.json"
    try:
        enc1.saveToFile(path, "JSON")
        output1 = enc1.encode(document)

        # Change the parameters so we know the params were replaced from the
        # contents of the file.
        # Note: we should have a constructor without parameters for this
        # situation.
        params.size = 10
        params.sparsity = 0.5
        enc2 = SimHashDocumentEncoder(params)
        enc2.loadFromFile(path, "JSON")
    finally:
        # Always clean up, even when save/load raised; the existence check
        # keeps a failed save from being masked by a missing-file error.
        if os.path.exists(path):
            os.remove(path)
    output2 = enc2.encode(document)

    assert(enc1.size == enc2.size)
    assert(enc1.parameters.size == enc2.parameters.size)
    assert(enc1.parameters.activeBits == enc2.parameters.activeBits)
    # Same check the pickle round-trip test makes: the restored encoder must
    # encode identically, not merely report matching parameters.
    assert(output1 == output2)
def testSerializeString(self):
    """Round-trip an encoder through writeToString() / loadFromString().

    A second encoder is built with different parameters (verified to differ),
    then overwritten from the serialized string. Afterwards both encoders
    must agree on size / activeBits and on the encoding of the same document.
    """
    vocab = {
        "hear": 2, "nothing": 4, "but": 1, "a": 1, "rushing": 4, "sound": 3}
    document = [
        "hear", "any", "sound", "sound", "louder", "but", "walls"]

    params = SimHashDocumentEncoderParameters()
    params.size = 400
    params.sparsity = 0.33
    params.encodeOrphans = True
    params.vocabulary = vocab
    enc1 = SimHashDocumentEncoder(params)
    serialized = enc1.writeToString()
    output1 = enc1.encode(document)

    # Build the target encoder with different parameters so a successful load
    # is observable.
    params.size = 40
    params.sparsity = 0.1
    enc2 = SimHashDocumentEncoder(params)
    assert(enc1.size != enc2.size)
    assert(enc1.parameters.size != enc2.parameters.size)
    assert(enc1.parameters.activeBits != enc2.parameters.activeBits)

    enc2.loadFromString(serialized)
    # Encode with the *restored* encoder; encoding with enc1 again would make
    # the final comparison trivially true.
    output2 = enc2.encode(document)

    assert(enc1.size == enc2.size)
    assert(enc1.parameters.size == enc2.parameters.size)
    assert(enc1.parameters.activeBits == enc2.parameters.activeBits)
    assert(output1 == output2)
def testConstructor(self):
    """Exercise encoder construction with valid and invalid parameter sets.

    Valid: size + activeBits, or size + sparsity.
    Invalid (must raise RuntimeError): both activeBits and sparsity, neither
    of them, or frequencyCeiling below frequencyFloor.
    """
    # test good encoder params - using 'activeBits'
    params1 = SimHashDocumentEncoderParameters()
    params1.size = 400
    params1.activeBits = 20
    encoder1 = SimHashDocumentEncoder(params1)
    assert(encoder1)
    assert(encoder1.dimensions == [params1.size])
    assert(encoder1.size == params1.size)
    assert(encoder1.parameters.size == params1.size)
    assert(encoder1.parameters.activeBits == params1.activeBits)
    assert(not encoder1.parameters.tokenSimilarity)

    # test bad encoder params - both activeBits and sparsity
    params2 = SimHashDocumentEncoderParameters()
    params2.size = 400
    params2.activeBits = 20
    params2.sparsity = 0.666
    encoder2 = None
    with self.assertRaises(RuntimeError):
        encoder2 = SimHashDocumentEncoder(params2)
    # Checked after the failed construction so the assertion is meaningful.
    assert(encoder2 is None)

    # test bad encoder params - neither activeBits or sparsity
    params3 = SimHashDocumentEncoderParameters()
    params3.size = 400
    encoder3 = None
    with self.assertRaises(RuntimeError):
        encoder3 = SimHashDocumentEncoder(params3)
    assert(encoder3 is None)

    # test good encoder param - using 'sparsity' instead of 'activeBits'
    params4 = SimHashDocumentEncoderParameters()
    params4.size = 400
    params4.sparsity = 0.05
    encoder4 = SimHashDocumentEncoder(params4)
    assert(encoder4)
    assert(encoder4.dimensions == [params4.size])
    assert(encoder4.size == params4.size)
    assert(encoder4.parameters.size == params4.size)
    # 400 * 0.05 == 20 active bits
    assert(encoder4.parameters.activeBits == 20)
    assert(not encoder4.parameters.tokenSimilarity)

    # test bad encoder params - frequency should be ceiling > floor
    params5 = SimHashDocumentEncoderParameters()
    params5.size = 400
    params5.sparsity = 0.05
    params5.frequencyCeiling = 3
    params5.frequencyFloor = 6
    encoder5 = None
    with self.assertRaises(RuntimeError):
        encoder5 = SimHashDocumentEncoder(params5)
    assert(encoder5 is None)
def testUnicode(self):
    """Check tokenSimilarity against two near-identical Unicode documents.

    Each token of the second document is the matching token of the first
    with one extra character appended. With tokenSimilarity on, the two
    encodings must overlap in more than 65 bits; with it off, in fewer.
    """
    docShort = [
        "\u0395\u0396\u0397\u0398\u0399",
        "\u0400\u0401\u0402\u0403\u0404",
        "\u0405\u0406\u0407\u0408\u0409"]
    docLong = [
        "\u0395\u0396\u0397\u0398\u0399\u0410",
        "\u0400\u0401\u0402\u0403\u0404\u0410",
        "\u0405\u0406\u0407\u0408\u0409\u0410"]

    config = SimHashDocumentEncoderParameters()
    config.size = 400
    config.sparsity = 0.33

    # unicode 'tokenSimilarity' ON
    config.tokenSimilarity = True
    similarEncoder = SimHashDocumentEncoder(config)
    sdrShort = SDR(config.size)
    sdrLong = SDR(config.size)
    similarEncoder.encode(docShort, sdrShort)
    similarEncoder.encode(docLong, sdrLong)
    assert(sdrShort.getOverlap(sdrLong) > 65)

    # unicode 'tokenSimilarity' OFF
    config.tokenSimilarity = False
    exactEncoder = SimHashDocumentEncoder(config)
    # Reuse the same SDR buffers, cleared first.
    sdrShort.zero()
    sdrLong.zero()
    exactEncoder.encode(docShort, sdrShort)
    exactEncoder.encode(docLong, sdrLong)
    assert(sdrShort.getOverlap(sdrLong) < 65)
def testTokenVocabulary(self):
    """Check how encodeOrphans treats tokens that are absent from the vocabulary.

    The second input is the first one plus six tokens missing from the
    vocabulary. With encodeOrphans on, the two encodings must differ; with
    it off, they must be equal.
    """
    weights = {
        "a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6,
        "g": 1, "h": 2, "i": 3, "j": 4, "k": 5, "l": 6}
    knownOnly = "a b c d e f"
    knownPlusOrphans = "a b c d e f t u w x y z"

    config = SimHashDocumentEncoderParameters()
    config.size = 400
    config.sparsity = 0.33
    config.vocabulary = weights

    # vocabulary +encodeOrphans
    config.encodeOrphans = True
    orphanEncoder = SimHashDocumentEncoder(config)
    withOrphansA = orphanEncoder.encode(knownOnly)
    withOrphansB = orphanEncoder.encode(knownPlusOrphans)
    assert(withOrphansA != withOrphansB)

    # vocabulary -encodeOrphans
    config.encodeOrphans = False
    strictEncoder = SimHashDocumentEncoder(config)
    strictA = strictEncoder.encode(knownOnly)
    strictB = strictEncoder.encode(knownPlusOrphans)
    assert(strictA == strictB)
def testTokenSimilarity(self):
    """Check the ordering of pairwise overlaps with tokenSimilarity on and off.

    Encodes the four shared test documents (testDoc1..testDoc4, defined
    outside this method) case-sensitively and asserts a strict ordering of
    selected pairwise overlaps for each setting of tokenSimilarity.
    """
    documents = (testDoc1, testDoc2, testDoc3, testDoc4)

    config = SimHashDocumentEncoderParameters()
    config.size = 400
    config.sparsity = 0.33
    config.caseSensitivity = True

    # tokenSimilarity ON
    config.tokenSimilarity = True
    similarEncoder = SimHashDocumentEncoder(config)
    sdrs = [SDR(config.size) for _ in documents]
    for doc, sdr in zip(documents, sdrs):
        similarEncoder.encode(doc, sdr)
    sdr1, sdr2, sdr3, sdr4 = sdrs
    assert(sdr3.getOverlap(sdr4) > sdr2.getOverlap(sdr3))
    assert(sdr2.getOverlap(sdr3) > sdr1.getOverlap(sdr3))
    assert(sdr1.getOverlap(sdr3) > sdr1.getOverlap(sdr4))

    # tokenSimilarity OFF
    config.tokenSimilarity = False
    exactEncoder = SimHashDocumentEncoder(config)
    # Clear all buffers first, then re-encode into the same SDR objects.
    for sdr in sdrs:
        sdr.zero()
    for doc, sdr in zip(documents, sdrs):
        exactEncoder.encode(doc, sdr)
    assert(sdr1.getOverlap(sdr2) > sdr2.getOverlap(sdr3))
    assert(sdr2.getOverlap(sdr3) > sdr3.getOverlap(sdr4))
    assert(sdr3.getOverlap(sdr4) > sdr1.getOverlap(sdr3))
def testEncoding(self):
    """Check basic encode() behavior for list, string and empty inputs.

    Verifies the output size and number of active bits, that a
    whitespace-separated string encodes like the token list testDoc1
    (defined outside this method), and that empty inputs give an all-zero SDR.
    """
    config = SimHashDocumentEncoderParameters()
    config.size = 400
    config.activeBits = 20

    # main call style - list
    listEncoder = SimHashDocumentEncoder(config)
    fromList = listEncoder.encode(testDoc1)
    assert(listEncoder.size == config.size)
    assert(fromList.size == config.size)
    assert(fromList.getSum() == config.activeBits)

    # simple alternate calling style - string
    # NOTE(review): presumably the space-joined form of testDoc1 - confirm.
    stringEncoder = SimHashDocumentEncoder(config)
    text = "abcde fghij klmno pqrst uvwxy"
    fromString = stringEncoder.encode(text)
    assert(fromList == fromString)

    # encoding empty values leads to output of zeros
    allZero = SDR(config.size)
    allZero.zero()
    fromEmptyList = listEncoder.encode([])
    fromEmptyString = listEncoder.encode("")
    assert(fromEmptyList == allZero)
    assert(fromEmptyString == allZero)
def testTokenCaseSensitivity(self):
    """Check caseSensitivity alone and combined with excludes / vocabulary.

    - ON: lower-case and upper-case versions of a document encode differently.
    - OFF: they encode identically.
    - OFF + excludes: mixed-case exclude entries still remove the matching
      lower-case tokens.
    - OFF + vocabulary (no orphans): mixed-case vocabulary keys still match
      the lower-case tokens.
    """
    # Case-sensitive strings
    lowerDoc = [
        "alpha", "bravo", "delta", "echo", "foxtrot", "hotel"]
    upperDoc = [
        "ALPHA", "BRAVO", "DELTA", "ECHO", "FOXTROT", "HOTEL"]
    keptTokens = ["eCHo", "foXTROt", "hOtEl"]
    droppedTokens = ["AlPHa", "BRaVo", "dELTa"]
    mixedCaseVocab = {"EcHo": 1, "FOxtRoT": 1, "HoTeL": 1}

    # caseSensitivity ON
    config = SimHashDocumentEncoderParameters()
    config.size = 400
    config.sparsity = 0.33
    config.caseSensitivity = True
    sensitiveEncoder = SimHashDocumentEncoder(config)
    lowerSdr = sensitiveEncoder.encode(lowerDoc)
    upperSdr = sensitiveEncoder.encode(upperDoc)
    assert(lowerSdr != upperSdr)

    # caseSensitivity OFF
    config.caseSensitivity = False
    insensitiveEncoder = SimHashDocumentEncoder(config)
    lowerSdr = insensitiveEncoder.encode(lowerDoc)
    upperSdr = insensitiveEncoder.encode(upperDoc)
    assert(lowerSdr == upperSdr)

    # caseSensitivity=OFF +excludes
    config.excludes = droppedTokens
    excludingEncoder = SimHashDocumentEncoder(config)
    fullMinusExcluded = excludingEncoder.encode(lowerDoc)
    keptOnly = excludingEncoder.encode(keptTokens)
    assert(fullMinusExcluded == keptOnly)

    # caseSensitivity=OFF +vocabulary
    vocabConfig = SimHashDocumentEncoderParameters()
    vocabConfig.size = 400
    vocabConfig.sparsity = 0.33
    vocabConfig.caseSensitivity = False
    vocabConfig.encodeOrphans = False
    vocabConfig.vocabulary = mixedCaseVocab
    vocabEncoder = SimHashDocumentEncoder(vocabConfig)
    fullViaVocab = vocabEncoder.encode(lowerDoc)
    keptViaVocab = vocabEncoder.encode(keptTokens)
    assert(fullViaVocab == keptViaVocab)
def testDeterminism(self):
    """Check that a fixed sentence always encodes to a known reference SDR.

    The reference holds 50 active bits out of 1000, matching the 0.05
    sparsity configured below.
    """
    expected = SDR(1000)
    expected.sparse = [
        2, 34, 37, 38, 69, 79, 114, 170, 200, 234,
        254, 258, 279, 289, 291, 292, 295, 307, 321, 336,
        345, 350, 361, 373, 378, 400, 450, 461, 462, 487,
        520, 532, 539, 548, 576, 583, 616, 623, 626, 627,
        663, 681, 695, 716, 794, 799, 830, 835, 837, 841]

    config = SimHashDocumentEncoderParameters()
    config.size = expected.size
    config.sparsity = 0.05

    actual = SimHashDocumentEncoder(config).encode(
        "I came to the fork in the road")
    assert(actual == expected)
def testFrequency(self):
    """Check that frequencyFloor / frequencyCeiling change the encoding.

    Token case: the same token string is encoded with default limits, with
    floor=1, and with floor=0 + ceiling=4; all three encodings must differ.
    Character case (tokenSimilarity on): adding ceiling=3 must change the
    encoding compared to the default limits.
    """
    repeatedTokens = "a a a b b c d d d d e e f"  # min 1 max 4
    repeatedChars = "abbbbbbcccdefg aaaaaabccchijk aaabcccccclmno"

    # Test token frequency floor/ceiling
    tokenConfig = SimHashDocumentEncoderParameters()
    tokenConfig.size = 400
    tokenConfig.sparsity = 0.33
    unlimited = SimHashDocumentEncoder(tokenConfig).encode(repeatedTokens)

    tokenConfig.frequencyFloor = 1
    floored = SimHashDocumentEncoder(tokenConfig).encode(repeatedTokens)

    tokenConfig.frequencyFloor = 0
    tokenConfig.frequencyCeiling = 4
    capped = SimHashDocumentEncoder(tokenConfig).encode(repeatedTokens)

    assert(unlimited != floored)
    assert(unlimited != capped)
    assert(floored != capped)

    # Test character frequency ceiling (only)
    charConfig = SimHashDocumentEncoderParameters()
    charConfig.size = 400
    charConfig.sparsity = 0.33
    charConfig.tokenSimilarity = True
    charUnlimited = SimHashDocumentEncoder(charConfig).encode(repeatedChars)

    charConfig.frequencyCeiling = 3
    charCapped = SimHashDocumentEncoder(charConfig).encode(repeatedChars)

    assert(charUnlimited != charCapped)
def testBasicExampleUseCase(self):
    """Minimal usage example: similar sentences overlap more than unrelated ones."""
    skySentence = "The sky is beautiful today"
    sunSentence = "The sun is beautiful today"  # similar up, differ down
    homeworkSentence = "Who did my homework today"

    # setup params
    config = SimHashDocumentEncoderParameters()
    config.size = 400
    config.sparsity = 0.33

    # init encoder
    encoder = SimHashDocumentEncoder(config)

    # encode!
    skySdr = encoder.encode(skySentence)
    sunSdr = encoder.encode(sunSentence)
    homeworkSdr = encoder.encode(homeworkSentence)

    # The encodings of the sky/sun sentences should be more similar than the
    # encodings of the sun/homework sentences (which should be more disparate).
    assert(skySdr.getOverlap(sunSdr) > sunSdr.getOverlap(homeworkSdr))
def testTokenWeightMap(self):
    """Check that vocabulary weights influence the overlap between documents.

    Documents sharing the weight-4 tokens ("aaa", "ddd") must overlap more
    than documents sharing only weight-2 / weight-1 tokens, which in turn
    must overlap more than the remaining pair.
    """
    tokenWeights = {
        "aaa": 4, "bbb": 2, "ccc": 2, "ddd": 4,
        "eee": 2, "fff": 2, "sss": 1}
    heavyMixed = ["aaa", "bbb", "ccc", "ddd", "sss"]
    lightOnly = ["eee", "bbb", "ccc", "fff", "sss"]
    heavyOther = ["aaa", "eee", "fff", "ddd"]

    config = SimHashDocumentEncoderParameters()
    config.size = 400
    config.sparsity = 0.33
    config.tokenSimilarity = False
    config.encodeOrphans = False
    config.vocabulary = tokenWeights

    encoder = SimHashDocumentEncoder(config)
    sdrHeavyMixed = encoder.encode(heavyMixed)
    sdrLightOnly = encoder.encode(lightOnly)
    sdrHeavyOther = encoder.encode(heavyOther)

    assert(sdrHeavyMixed.getOverlap(sdrHeavyOther)
           > sdrHeavyMixed.getOverlap(sdrLightOnly))
    assert(sdrHeavyMixed.getOverlap(sdrLightOnly)
           > sdrLightOnly.getOverlap(sdrHeavyOther))
def testExcludes(self):
    """Check that excluded tokens are ignored when encoding.

    Encoding the full token list with the unwanted tokens excluded must
    equal encoding only the kept tokens, and both must differ from encoding
    the full list without excludes.
    """
    kept = ["but", "it", "all", "stays", "the", "same"]
    unwanted = ["seasons", "change", "mad", "things", "rearrange"]
    everything = kept + unwanted

    config = SimHashDocumentEncoderParameters()
    config.size = 400
    config.sparsity = 0.33

    fullSdr = SimHashDocumentEncoder(config).encode(everything)
    keptSdr = SimHashDocumentEncoder(config).encode(kept)

    config.excludes = unwanted
    filteredSdr = SimHashDocumentEncoder(config).encode(everything)

    assert(fullSdr != keptSdr)      # full != part
    assert(fullSdr != filteredSdr)  # full != (full - nope)
    assert(keptSdr == filteredSdr)  # part == (full - nope)
def testSerializePickle(self):
    """Round-trip an encoder through pickle and compare it to the original.

    The unpickled encoder must report the same size / activeBits and must
    produce the same encoding for the same document.
    """
    tokenWeights = {
        "hear": 2, "nothing": 4, "but": 1, "a": 1, "rushing": 4, "sound": 3}
    tokens = [
        "hear", "any", "sound", "sound", "louder", "but", "walls"]

    config = SimHashDocumentEncoderParameters()
    config.size = 400
    config.sparsity = 0.33
    config.encodeOrphans = True
    config.vocabulary = tokenWeights

    original = SimHashDocumentEncoder(config)
    # Pickle before encoding, mirroring the other serialization tests.
    blob = pickle.dumps(original)
    expected = original.encode(tokens)

    # The blob was produced a few lines above, so loading it is safe here.
    restored = pickle.loads(blob)
    actual = restored.encode(tokens)

    assert(original.size == restored.size)
    assert(original.parameters.size == restored.parameters.size)
    assert(original.parameters.activeBits == restored.parameters.activeBits)
    assert(expected == actual)
def testStatistics(self):
    """Run mass-encoding statistics for tokenSimilarity off and on.

    Random documents are drawn from a 100-word corpus, encoded with both
    encoders, and fed to one Metrics collector per encoder. Entropy,
    activation frequency, overlap and sparsity are then checked against
    fixed thresholds.
    """
    # 100 random simple English words to run mass encoding stats against
    testCorpus = [
        "find", "any", "new", "work", "part", "take", "get", "place", "made",
        "live", "where", "after", "back", "little", "only", "round", "man",
        "year", "came", "show", "every", "good", "me", "give", "our", "under",
        "name", "very", "through", "just", "form", "sentence", "great",
        "think", "say", "help", "low", "line", "differ", "turn", "cause",
        "much", "mean", "before", "move", "right", "boy", "old", "too",
        "same", "tell", "does", "set", "three", "want", "air", "well", "also",
        "play", "small", "end", "put", "home", "read", "hand", "port",
        "large", "spell", "add", "even", "land", "here", "must", "big",
        "high", "such", "follow", "act", "why", "ask", "men", "change",
        "went", "light", "kind", "off", "need", "house", "picture", "try",
        "us", "again", "animal", "point", "mother", "world", "near", "build",
        "self", "earth"]
    num_samples = 1000  # number of documents to run
    num_tokens = 10     # tokens per document

    # Case 1 = tokenSimilarity OFF
    params1 = SimHashDocumentEncoderParameters()
    params1.size = 400
    params1.sparsity = 0.33
    params1.tokenSimilarity = False
    encoder1 = SimHashDocumentEncoder(params1)

    # Case 2 = tokenSimilarity ON
    # NOTE(review): params2 is the same object as params1, not a copy, so the
    # assignment below also changes params1.tokenSimilarity. encoder1 was
    # constructed before that change - confirm it keeps its own parameters.
    params2 = params1
    params2.tokenSimilarity = True
    encoder2 = SimHashDocumentEncoder(params2)

    sdrs1 = []
    sdrs2 = []
    for _sample in range(num_samples):
        # NOTE(review): this draws num_tokens - 1 (i.e. 9) tokens per document
        # although num_tokens is described as "tokens per document" - confirm.
        document = [
            random.choice(testCorpus) for _token in range(num_tokens - 1)]
        sdrs1.append(encoder1.encode(document))
        sdrs2.append(encoder2.encode(document))

    report1 = Metrics([encoder1.size], len(sdrs1) + 1)
    report2 = Metrics([encoder2.size], len(sdrs2) + 1)
    for report, sdrs in ((report1, sdrs1), (report2, sdrs2)):
        for sdr in sdrs:
            report.addData(sdr)

    # Assertions for Case 1 = tokenSimilarity OFF
    target1 = params1.sparsity
    assert(report1.activationFrequency.entropy() > 0.87)
    assert(report1.activationFrequency.min() > 0.01)
    assert(report1.activationFrequency.max() < 0.99)
    assert(report1.activationFrequency.mean() > target1 - 0.005)
    assert(report1.activationFrequency.mean() < target1 + 0.005)
    assert(report1.overlap.min() > 0.21)
    assert(report1.overlap.max() > 0.53)
    assert(report1.overlap.mean() > 0.38)
    assert(report1.sparsity.min() > target1 - 0.01)
    assert(report1.sparsity.max() < target1 + 0.01)
    assert(report1.sparsity.mean() > target1 - 0.005)
    assert(report1.sparsity.mean() < target1 + 0.005)

    # Assertions for Case 2 = tokenSimilarity ON
    target2 = params2.sparsity
    assert(report2.activationFrequency.entropy() > 0.59)
    assert(report2.activationFrequency.min() >= 0)
    assert(report2.activationFrequency.max() <= 1)
    assert(report2.activationFrequency.mean() > target2 - 0.005)
    assert(report2.activationFrequency.mean() < target2 + 0.005)
    assert(report2.overlap.min() > 0.38)
    assert(report2.overlap.max() > 0.78)
    assert(report2.overlap.mean() > 0.61)
    assert(report2.sparsity.min() > target2 - 0.01)
    assert(report2.sparsity.max() < target2 + 0.01)
    assert(report2.sparsity.mean() > target2 - 0.005)
    assert(report2.sparsity.mean() < target2 + 0.005)