Пример #1
0
    def testConstructor(self):
        """Check SimHashDocumentEncoder construction for good and bad parameters.

        Valid parameter sets (``activeBits`` given, or ``sparsity`` given) must
        produce an encoder whose reported parameters match. Invalid sets are
        expected to raise ``RuntimeError`` and leave the target variable unset.
        """
        # test good encoder params - using 'activeBits'
        params1 = SimHashDocumentEncoderParameters()
        params1.size = 400
        params1.activeBits = 20
        encoder1 = SimHashDocumentEncoder(params1)
        self.assertTrue(encoder1)
        self.assertEqual(encoder1.dimensions, [params1.size])
        self.assertEqual(encoder1.size, params1.size)
        self.assertEqual(encoder1.parameters.size, params1.size)
        self.assertEqual(encoder1.parameters.activeBits, params1.activeBits)
        self.assertFalse(encoder1.parameters.tokenSimilarity)

        # test bad encoder params - both activeBits and sparsity
        params2 = SimHashDocumentEncoderParameters()
        params2.size = 400
        params2.activeBits = 20
        params2.sparsity = 0.666
        encoder2 = None
        with self.assertRaises(RuntimeError):
            encoder2 = SimHashDocumentEncoder(params2)
        # Checked after the failed construction so the assertion is meaningful.
        self.assertIsNone(encoder2)

        # test bad encoder params - neither activeBits or sparsity
        params3 = SimHashDocumentEncoderParameters()
        params3.size = 400
        encoder3 = None
        with self.assertRaises(RuntimeError):
            encoder3 = SimHashDocumentEncoder(params3)
        self.assertIsNone(encoder3)

        # test good encoder param - using 'sparsity' instead of 'activeBits'
        params4 = SimHashDocumentEncoderParameters()
        params4.size = 400
        params4.sparsity = 0.05
        encoder4 = SimHashDocumentEncoder(params4)
        self.assertTrue(encoder4)
        self.assertEqual(encoder4.dimensions, [params4.size])
        self.assertEqual(encoder4.size, params4.size)
        self.assertEqual(encoder4.parameters.size, params4.size)
        # size 400 with sparsity 0.05 is expected to resolve to 20 active bits
        self.assertEqual(encoder4.parameters.activeBits, 20)
        self.assertFalse(encoder4.parameters.tokenSimilarity)

        # test bad encoder params - frequency should be ceiling > floor
        params5 = SimHashDocumentEncoderParameters()
        params5.size = 400
        params5.sparsity = 0.05
        params5.frequencyCeiling = 3
        params5.frequencyFloor = 6
        encoder5 = None
        with self.assertRaises(RuntimeError):
            encoder5 = SimHashDocumentEncoder(params5)
        self.assertIsNone(encoder5)
Пример #2
0
    def testEncoding(self):
        """Check basic encoding output for list, string, and empty inputs."""
        cfg = SimHashDocumentEncoderParameters()
        cfg.size = 400
        cfg.activeBits = 20

        # primary calling style: pass the shared testDoc1 fixture
        # (presumably a list of tokens, defined elsewhere in the module)
        listEncoder = SimHashDocumentEncoder(cfg)
        listResult = listEncoder.encode(testDoc1)
        assert listEncoder.size == cfg.size
        assert listResult.size == cfg.size
        assert listResult.getSum() == cfg.activeBits

        # alternate calling style: a single whitespace-separated string
        # must encode to the same output as the fixture above
        stringEncoder = SimHashDocumentEncoder(cfg)
        stringResult = stringEncoder.encode("abcde fghij klmno pqrst uvwxy")
        assert listResult == stringResult

        # empty inputs (empty list, empty string) must encode to all zeros
        allZeros = SDR(cfg.size)
        allZeros.zero()
        emptyResults = [listEncoder.encode(blank) for blank in ([], "")]
        for emptyResult in emptyResults:
            assert emptyResult == allZeros