def testTokenCaseSensitivity(self):
    """Exercise the ``caseSensitivity`` parameter of the document encoder.

    Four scenarios are checked in sequence:

    1. Case sensitivity on: a lower-case document and its upper-case twin
       are expected to encode differently.
    2. Case sensitivity off: the same two documents are expected to encode
       identically.
    3. Case sensitivity off plus mixed-case ``excludes``: the full
       lower-case document is expected to encode the same as the
       mixed-case subset that is not excluded.
    4. Case sensitivity off plus a mixed-case ``vocabulary`` with
       ``encodeOrphans`` disabled: the same equality is expected.
    """
    # Token fixtures; spellings differ only by letter case.
    lowerDoc = ["alpha", "bravo", "delta", "echo", "foxtrot", "hotel"]
    upperDoc = ["ALPHA", "BRAVO", "DELTA", "ECHO", "FOXTROT", "HOTEL"]
    keptTokens = ["eCHo", "foXTROt", "hOtEl"]
    droppedTokens = ["AlPHa", "BRaVo", "dELTa"]
    mixedCaseVocab = {"EcHo": 1, "FOxtRoT": 1, "HoTeL": 1}

    # Scenario 1: case matters, so the two spellings must not collide.
    config = SimHashDocumentEncoderParameters()
    config.size = 400
    config.sparsity = 0.33
    config.caseSensitivity = True
    sensitiveEncoder = SimHashDocumentEncoder(config)
    lowerResult = sensitiveEncoder.encode(lowerDoc)
    upperResult = sensitiveEncoder.encode(upperDoc)
    assert lowerResult != upperResult

    # Scenario 2: case ignored, so both spellings must match.
    config.caseSensitivity = False
    insensitiveEncoder = SimHashDocumentEncoder(config)
    lowerResult = insensitiveEncoder.encode(lowerDoc)
    upperResult = insensitiveEncoder.encode(upperDoc)
    assert lowerResult == upperResult

    # Scenario 3: same parameter object, now with mixed-case excludes.
    config.excludes = droppedTokens
    excludingEncoder = SimHashDocumentEncoder(config)
    fullWithExcludes = excludingEncoder.encode(lowerDoc)
    subsetWithExcludes = excludingEncoder.encode(keptTokens)
    assert fullWithExcludes == subsetWithExcludes

    # Scenario 4: fresh parameters with a mixed-case vocabulary and
    # orphan encoding switched off.
    vocabConfig = SimHashDocumentEncoderParameters()
    vocabConfig.size = 400
    vocabConfig.sparsity = 0.33
    vocabConfig.caseSensitivity = False
    vocabConfig.encodeOrphans = False
    vocabConfig.vocabulary = mixedCaseVocab
    vocabEncoder = SimHashDocumentEncoder(vocabConfig)
    fullWithVocab = vocabEncoder.encode(lowerDoc)
    subsetWithVocab = vocabEncoder.encode(keptTokens)
    assert fullWithVocab == subsetWithVocab
def testExcludes(self):
    """Exercise the ``excludes`` parameter of the document encoder.

    Encodes the full token list, the kept-only list, and the full list
    again with the unwanted tokens listed in ``excludes``. The test expects
    the excluded-full encoding to equal the kept-only encoding, and both to
    differ from the plain full encoding.
    """
    wanted = ["but", "it", "all", "stays", "the", "same"]
    unwanted = ["seasons", "change", "mad", "things", "rearrange"]
    everything = wanted + unwanted

    config = SimHashDocumentEncoderParameters()
    config.size = 400
    config.sparsity = 0.33

    # Two encoders built from the same exclusion-free parameters.
    encodedFull = SimHashDocumentEncoder(config).encode(everything)
    encodedWanted = SimHashDocumentEncoder(config).encode(wanted)

    # Third encoder is built after excludes has been set on the parameters.
    config.excludes = unwanted
    encodedFiltered = SimHashDocumentEncoder(config).encode(everything)

    assert encodedFull != encodedWanted  # full != part
    assert encodedFull != encodedFiltered  # full != (full - nope)
    assert encodedWanted == encodedFiltered  # part == (full - nope)