def testTokenSimilarity(self):
    """Check the ordering of pairwise overlaps with tokenSimilarity on and off.

    Encodes testDoc1..testDoc4 (defined outside this method) into four SDRs,
    first with an encoder built while ``tokenSimilarity`` is True and then,
    after zeroing the SDRs, with an encoder built while it is False.  For
    each setting a strict ordering of selected pairwise overlaps is asserted.
    """
    config = SimHashDocumentEncoderParameters()
    config.size = 400
    config.sparsity = 0.33
    config.caseSensitivity = True

    # --- tokenSimilarity ON ---
    config.tokenSimilarity = True
    similarityEncoder = SimHashDocumentEncoder(config)
    sdrs = [SDR(config.size) for _ in range(4)]
    sdr1, sdr2, sdr3, sdr4 = sdrs
    documents = (testDoc1, testDoc2, testDoc3, testDoc4)
    for document, sdr in zip(documents, sdrs):
        similarityEncoder.encode(document, sdr)

    # Expected: (3,4) > (2,3) > (1,3) > (1,4)
    overlap34 = sdr3.getOverlap(sdr4)
    overlap23 = sdr2.getOverlap(sdr3)
    overlap13 = sdr1.getOverlap(sdr3)
    overlap14 = sdr1.getOverlap(sdr4)
    assert overlap34 > overlap23
    assert overlap23 > overlap13
    assert overlap13 > overlap14

    # --- tokenSimilarity OFF ---
    config.tokenSimilarity = False
    plainEncoder = SimHashDocumentEncoder(config)
    # Reuse the same SDR objects: clear them all, then re-encode each document.
    for sdr in sdrs:
        sdr.zero()
    for document, sdr in zip(documents, sdrs):
        plainEncoder.encode(document, sdr)

    # Expected: (1,2) > (2,3) > (3,4) > (1,3)
    overlap12 = sdr1.getOverlap(sdr2)
    overlap23 = sdr2.getOverlap(sdr3)
    overlap34 = sdr3.getOverlap(sdr4)
    overlap13 = sdr1.getOverlap(sdr3)
    assert overlap12 > overlap23
    assert overlap23 > overlap34
    assert overlap34 > overlap13
def testTokenCaseSensitivity(self):
    """Check how the ``caseSensitivity`` parameter affects encodings.

    Four scenarios are asserted:
      * caseSensitivity True: lower- and upper-case documents encode
        differently.
      * caseSensitivity False: the same two documents encode equally.
      * caseSensitivity False with mixed-case ``excludes``: the full
        lower-case document encodes the same as the mixed-case partial one.
      * caseSensitivity False with a mixed-case ``vocabulary`` and
        ``encodeOrphans`` False: the same equality holds.
    """
    # Same tokens spelled in different letter cases
    lowerDoc = ["alpha", "bravo", "delta", "echo", "foxtrot", "hotel"]
    upperDoc = ["ALPHA", "BRAVO", "DELTA", "ECHO", "FOXTROT", "HOTEL"]
    mixedPart = ["eCHo", "foXTROt", "hOtEl"]
    mixedDiscard = ["AlPHa", "BRaVo", "dELTa"]
    mixedVocab = {
        "EcHo": 1,
        "FOxtRoT": 1,
        "HoTeL": 1,
    }

    # 1) caseSensitivity ON
    config = SimHashDocumentEncoderParameters()
    config.size = 400
    config.sparsity = 0.33
    config.caseSensitivity = True
    sensitiveEncoder = SimHashDocumentEncoder(config)
    lowerSdr = sensitiveEncoder.encode(lowerDoc)
    upperSdr = sensitiveEncoder.encode(upperDoc)
    assert lowerSdr != upperSdr

    # 2) caseSensitivity OFF
    config.caseSensitivity = False
    insensitiveEncoder = SimHashDocumentEncoder(config)
    lowerSdr = insensitiveEncoder.encode(lowerDoc)
    upperSdr = insensitiveEncoder.encode(upperDoc)
    assert lowerSdr == upperSdr

    # 3) caseSensitivity OFF, plus excludes
    config.excludes = mixedDiscard
    excludingEncoder = SimHashDocumentEncoder(config)
    fullWithExcludes = excludingEncoder.encode(lowerDoc)
    partWithExcludes = excludingEncoder.encode(mixedPart)
    assert fullWithExcludes == partWithExcludes

    # 4) caseSensitivity OFF, plus vocabulary (fresh parameter object)
    vocabConfig = SimHashDocumentEncoderParameters()
    vocabConfig.size = 400
    vocabConfig.sparsity = 0.33
    vocabConfig.caseSensitivity = False
    vocabConfig.encodeOrphans = False
    vocabConfig.vocabulary = mixedVocab
    vocabEncoder = SimHashDocumentEncoder(vocabConfig)
    fullWithVocab = vocabEncoder.encode(lowerDoc)
    partWithVocab = vocabEncoder.encode(mixedPart)
    assert fullWithVocab == partWithVocab