예제 #1
0
    def testTokenCaseSensitivity(self):
        """
        Exercise the caseSensitivity parameter of SimHashDocumentEncoder.

        Four scenarios are asserted, in this order:
          1. caseSensitivity on: the lower-case and upper-case documents
             must encode to different outputs.
          2. caseSensitivity off: the same two documents must encode to
             equal outputs.
          3. caseSensitivity off with mixed-case excludes: the full
             lower-case document must encode the same as a mixed-case
             document holding only the non-excluded tokens.
          4. caseSensitivity off with a mixed-case vocabulary and
             encodeOrphans off: the full lower-case document must encode
             the same as the mixed-case document of vocabulary tokens.
        """
        # The same six tokens, spelled all lower-case and all upper-case.
        lowerDoc = ["alpha", "bravo", "delta", "echo", "foxtrot", "hotel"]
        upperDoc = ["ALPHA", "BRAVO", "DELTA", "ECHO", "FOXTROT", "HOTEL"]

        # Scenario 1: case matters.
        sharedParams = SimHashDocumentEncoderParameters()
        sharedParams.size = 400
        sharedParams.sparsity = 0.33
        sharedParams.caseSensitivity = True
        sensitiveEncoder = SimHashDocumentEncoder(sharedParams)
        lowerSensitive = sensitiveEncoder.encode(lowerDoc)
        upperSensitive = sensitiveEncoder.encode(upperDoc)
        assert lowerSensitive != upperSensitive

        # Scenario 2: case ignored. The same parameters object is reused
        # with only the flag flipped; a new encoder is built from it.
        sharedParams.caseSensitivity = False
        insensitiveEncoder = SimHashDocumentEncoder(sharedParams)
        lowerInsensitive = insensitiveEncoder.encode(lowerDoc)
        upperInsensitive = insensitiveEncoder.encode(upperDoc)
        assert lowerInsensitive == upperInsensitive

        # Scenario 3: case ignored, with mixed-case spellings of the first
        # three tokens excluded and mixed-case spellings of the last three
        # tokens used as the comparison document.
        droppedTokens = ["AlPHa", "BRaVo", "dELTa"]
        keptTokens = ["eCHo", "foXTROt", "hOtEl"]
        sharedParams.excludes = droppedTokens
        excludingEncoder = SimHashDocumentEncoder(sharedParams)
        fullWithExcludes = excludingEncoder.encode(lowerDoc)
        keptWithExcludes = excludingEncoder.encode(keptTokens)
        assert fullWithExcludes == keptWithExcludes

        # Scenario 4: case ignored, with a mixed-case vocabulary of the last
        # three tokens and orphan encoding switched off. This scenario uses
        # its own fresh parameters object.
        mixedCaseVocab = {"EcHo": 1, "FOxtRoT": 1, "HoTeL": 1}
        vocabParams = SimHashDocumentEncoderParameters()
        vocabParams.size = 400
        vocabParams.sparsity = 0.33
        vocabParams.caseSensitivity = False
        vocabParams.encodeOrphans = False
        vocabParams.vocabulary = mixedCaseVocab
        vocabEncoder = SimHashDocumentEncoder(vocabParams)
        fullWithVocab = vocabEncoder.encode(lowerDoc)
        keptWithVocab = vocabEncoder.encode(keptTokens)
        assert fullWithVocab == keptWithVocab
예제 #2
0
    def testExcludes(self):
        """
        Check the effect of the excludes parameter on an encoding.

        Three encodings are compared:
          - the full token list with no excludes,
          - only the "kept" tokens with no excludes,
          - the full token list with the remaining tokens set as excludes.

        The test asserts that the full encoding differs from the other
        two, and that those two are equal to each other.
        """
        keptWords = ["but", "it", "all", "stays", "the", "same"]
        excludedWords = ["seasons", "change", "mad", "things", "rearrange"]
        allWords = keptWords + excludedWords

        config = SimHashDocumentEncoderParameters()
        config.size = 400
        config.sparsity = 0.33

        # Both baseline encodings are produced before excludes is set on
        # the shared parameters object.
        plainEncoderFull = SimHashDocumentEncoder(config)
        encodedFull = plainEncoderFull.encode(allWords)

        plainEncoderKept = SimHashDocumentEncoder(config)
        encodedKept = plainEncoderKept.encode(keptWords)

        # Build a third encoder from the same parameters, now with excludes.
        config.excludes = excludedWords
        excludingEncoder = SimHashDocumentEncoder(config)
        encodedFiltered = excludingEncoder.encode(allWords)

        assert encodedFull != encodedKept      # full != part
        assert encodedFull != encodedFiltered  # full != (full - excluded)
        assert encodedKept == encodedFiltered  # part == (full - excluded)