def testConstructor(self):
    """Check which parameter sets SimHashDocumentEncoder accepts or rejects.

    Accepted sets (``size`` plus exactly one of ``activeBits`` or
    ``sparsity``) must yield an encoder reporting the configured values.
    Rejected sets must raise ``RuntimeError``, and the variable that was
    meant to receive the encoder must still be ``None`` afterwards.
    """
    # test good encoder params - using 'activeBits'
    params1 = SimHashDocumentEncoderParameters()
    params1.size = 400
    params1.activeBits = 20
    encoder1 = SimHashDocumentEncoder(params1)
    assert encoder1
    assert encoder1.dimensions == [params1.size]
    assert encoder1.size == params1.size
    assert encoder1.parameters.size == params1.size
    assert encoder1.parameters.activeBits == params1.activeBits
    assert not encoder1.parameters.tokenSimilarity

    # test bad encoder params - both activeBits and sparsity
    params2 = SimHashDocumentEncoderParameters()
    params2.size = 400
    params2.activeBits = 20
    params2.sparsity = 0.666
    encoder2 = None
    with self.assertRaises(RuntimeError):
        encoder2 = SimHashDocumentEncoder(params2)
    # Verified after the attempt: checking before it could never fail.
    assert encoder2 is None

    # test bad encoder params - neither activeBits or sparsity
    params3 = SimHashDocumentEncoderParameters()
    params3.size = 400
    encoder3 = None
    with self.assertRaises(RuntimeError):
        encoder3 = SimHashDocumentEncoder(params3)
    assert encoder3 is None

    # test good encoder param - using 'sparsity' instead of 'activeBits'
    params4 = SimHashDocumentEncoderParameters()
    params4.size = 400
    params4.sparsity = 0.05
    encoder4 = SimHashDocumentEncoder(params4)
    assert encoder4
    assert encoder4.dimensions == [params4.size]
    assert encoder4.size == params4.size
    assert encoder4.parameters.size == params4.size
    # The encoder is expected to report 20 active bits for
    # sparsity 0.05 at size 400.
    assert encoder4.parameters.activeBits == 20
    assert not encoder4.parameters.tokenSimilarity

    # test bad encoder params - frequency should be ceiling > floor
    params5 = SimHashDocumentEncoderParameters()
    params5.size = 400
    params5.sparsity = 0.05
    params5.frequencyCeiling = 3
    params5.frequencyFloor = 6
    encoder5 = None
    with self.assertRaises(RuntimeError):
        encoder5 = SimHashDocumentEncoder(params5)
    assert encoder5 is None
def testEncoding(self):
    """Exercise encode() with a list document, a string, and empty input.

    The list and string calling styles are expected to produce equal
    output, and empty input (``[]`` or ``""``) is expected to produce an
    all-zero SDR of the configured size.
    """
    config = SimHashDocumentEncoderParameters()
    config.size = 400
    config.activeBits = 20

    # Primary calling style: the module-level document `testDoc1`
    # (presumably a token list - confirm against its definition).
    listEncoder = SimHashDocumentEncoder(config)
    listResult = listEncoder.encode(testDoc1)
    assert listEncoder.size == config.size
    assert listResult.size == config.size
    assert listResult.getSum() == config.activeBits

    # Alternate calling style: a single whitespace-separated string,
    # encoded by a second encoder built from the same parameters.
    stringEncoder = SimHashDocumentEncoder(config)
    document = "abcde fghij klmno pqrst uvwxy"
    stringResult = stringEncoder.encode(document)
    assert listResult == stringResult

    # Empty input, in either style, must encode to an all-zero SDR.
    allZeros = SDR(config.size)
    allZeros.zero()
    emptyResults = [listEncoder.encode(emptyDoc) for emptyDoc in ([], "")]
    for emptyResult in emptyResults:
        assert emptyResult == allZeros