def testConstructor(self):
    """Check encoder construction with valid and invalid parameter sets.

    Valid parameters (``size`` plus exactly one of ``activeBits`` or
    ``sparsity``) must produce an encoder whose ``dimensions``, ``size`` and
    ``parameters`` reflect the request.  Invalid combinations must raise
    ``RuntimeError`` and leave the target variable unassigned.
    """
    # Good params: explicit activeBits.
    params1 = SimHashDocumentEncoderParameters()
    params1.size = 400
    params1.activeBits = 20
    encoder1 = SimHashDocumentEncoder(params1)
    self.assertTrue(encoder1)
    self.assertEqual(encoder1.dimensions, [params1.size])
    self.assertEqual(encoder1.size, params1.size)
    self.assertEqual(encoder1.parameters.size, params1.size)
    self.assertEqual(encoder1.parameters.activeBits, params1.activeBits)
    self.assertFalse(encoder1.parameters.tokenSimilarity)

    # Bad params: both activeBits and sparsity are set.
    params2 = SimHashDocumentEncoderParameters()
    params2.size = 400
    params2.activeBits = 20
    params2.sparsity = 0.666
    encoder2 = None
    with self.assertRaises(RuntimeError):
        encoder2 = SimHashDocumentEncoder(params2)
    # Checked after the failed construction so the assertion is meaningful:
    # the assignment must never have happened.
    self.assertIsNone(encoder2)

    # Bad params: neither activeBits nor sparsity is set.
    params3 = SimHashDocumentEncoderParameters()
    params3.size = 400
    encoder3 = None
    with self.assertRaises(RuntimeError):
        encoder3 = SimHashDocumentEncoder(params3)
    self.assertIsNone(encoder3)

    # Good params: 'sparsity' instead of 'activeBits'.
    params4 = SimHashDocumentEncoderParameters()
    params4.size = 400
    params4.sparsity = 0.05
    encoder4 = SimHashDocumentEncoder(params4)
    self.assertTrue(encoder4)
    self.assertEqual(encoder4.dimensions, [params4.size])
    self.assertEqual(encoder4.size, params4.size)
    self.assertEqual(encoder4.parameters.size, params4.size)
    # 400 * 0.05 == 20 active bits are expected to be derived from sparsity.
    self.assertEqual(encoder4.parameters.activeBits, 20)
    self.assertFalse(encoder4.parameters.tokenSimilarity)

    # Bad params: frequency ceiling must be greater than the floor.
    params5 = SimHashDocumentEncoderParameters()
    params5.size = 400
    params5.sparsity = 0.05
    params5.frequencyCeiling = 3
    params5.frequencyFloor = 6
    encoder5 = None
    with self.assertRaises(RuntimeError):
        encoder5 = SimHashDocumentEncoder(params5)
    self.assertIsNone(encoder5)
def testFrequency(self):
    """Check that frequency floor/ceiling settings change the encoding.

    Encodes the same document under different ``frequencyFloor`` /
    ``frequencyCeiling`` settings and requires every pair of outputs to
    differ, first for whole tokens and then for characters with
    ``tokenSimilarity`` enabled.
    """
    # Per-token counts range from 1 (c, f) to 4 (d).
    tokens = "a a a b b c d d d d e e f"
    charTokens = "abbbbbbcccdefg aaaaaabccchijk aaabcccccclmno"

    # Test token frequency floor/ceiling.
    # `params` is reused and mutated between encoders; each output is
    # computed before the next mutation.
    params = SimHashDocumentEncoderParameters()
    params.size = 400
    params.sparsity = 0.33
    encoder1 = SimHashDocumentEncoder(params)
    output1 = encoder1.encode(tokens)

    params.frequencyFloor = 1
    encoder2 = SimHashDocumentEncoder(params)
    output2 = encoder2.encode(tokens)

    params.frequencyFloor = 0
    params.frequencyCeiling = 4
    encoder3 = SimHashDocumentEncoder(params)
    output3 = encoder3.encode(tokens)

    self.assertNotEqual(output1, output2)
    self.assertNotEqual(output1, output3)
    self.assertNotEqual(output2, output3)

    # Test character frequency ceiling (only).
    params4 = SimHashDocumentEncoderParameters()
    params4.size = 400
    params4.sparsity = 0.33
    params4.tokenSimilarity = True
    encoder4 = SimHashDocumentEncoder(params4)
    output4 = encoder4.encode(charTokens)

    params4.frequencyCeiling = 3
    encoder5 = SimHashDocumentEncoder(params4)
    output5 = encoder5.encode(charTokens)

    self.assertNotEqual(output4, output5)