예제 #1
0
    def testLSH(self):
        """Check LSHCache.insert_batch on short strings using Shingler(1).

        First prints, for every unordered pair of test strings, one minus the
        jaccard_distance between their character sets, as a reference for
        reading the expectations. Then inserts all strings in one batch into a
        cache built with ``Shingler(1)`` and compares the returned list of
        sets with pinned values.
        """
        strings = [
            "abcdefghijklmnopqrstuvwxyz",
            "abcdefghijklmnopqrstuvw",
            "defghijklmnopqrstuvw",
            "zyxwvutsrqponmlkjihgfedcba",
            "1abcdefghijklmnopuvw1",
            "123456789",
            "012345678",
            "234567890",
        ]

        # Visit each pair once; starting enumerate at i + 1 makes j the
        # index of b in `strings`, with no manual offset arithmetic.
        for i, a in enumerate(strings):
            for j, b in enumerate(strings[i + 1:], i + 1):
                similarity = 1 - jaccard_distance(set(a), set(b))
                # A parenthesised single argument prints the same text under
                # the Python 2 print statement and the Python 3 print function.
                print("'%s' (%d) <=> (%d)'%s': %f" % (a, i, j, b, similarity))

        # Fixed seed so the pinned expectations below are reproducible
        # (presumably LSHCache draws from `random` -- confirm in its source).
        random.seed(12345)
        lsh = LSHCache(shingler=Shingler(1))
        expected = [
            set(),
            {0},
            {0, 1},
            {0, 1, 2},
            {0, 1, 2, 3},
            set(),
            {5},
            {5, 6},
        ]
        self.assertListEqual(expected, lsh.insert_batch(strings))
예제 #2
0
    def testExample(self):
        """Compare LSHCache.insert_batch results with pinned expectations.

        The same eleven documents, split on whitespace, are inserted in one
        batch into a fresh cache for each parameter set below. ``random`` is
        reseeded with 12345 before each cache is constructed.
        """
        corpus = [
            "lipstick on a pig",
            "you can put lipstick on a pig",
            "you may put lipstick on a pig but it's still a pig",
            "you can put lipstick on a pig it's still a pig",
            "i think they put some lipstick on a pig but it's still a pig",
            "putting lipstick on a pig",
            "you know you can put lipstick on a pig",
            "they were going to send us binders full of women",
            "they were going to send us binders of women",
            "a b c d e f",
            "a b c d f",
        ]

        # Each entry: (LSHCache keyword arguments, expected insert_batch result).
        scenarios = [
            # least strict
            (dict(b=50, r=2),
             [set(), {0}, {0, 1}, {0, 1, 2}, {0, 1, 2, 3},
              {0, 1, 2, 3, 4}, {0, 1, 2, 3, 4, 5},
              set(), {7}, set(), {9}]),
            # stricter
            (dict(b=25, r=4),
             [set(), {0}, set(), {1}, {2},
              {0, 1}, {0, 1, 5},
              set(), {7}, set(), {9}]),
            # stricter still
            (dict(b=20, r=5),
             [set(), {0}, set(), {1}, set(),
              {0, 1}, {0, 1, 3, 5},
              set(), {7}, set(), set()]),
            # most strict
            (dict(b=10, r=10),
             [set(), set(), set(), set(), set(),
              set(), {1},
              set(), set(), set(), set()]),
            # least strict again (b=50, r=2), this time with m=3
            (dict(b=50, r=2, m=3),
             [set(), {0}, set(), {0, 1, 2}, {0, 2, 3},
              {0, 1, 3}, {0, 1, 3, 5},
              set(), {7}, set(), {9}]),
        ]

        for params, expected in scenarios:
            # Reseed before building each cache so every scenario starts from
            # the same random state.
            random.seed(12345)
            cache = LSHCache(**params)
            # Tokenize afresh for every scenario rather than sharing one list.
            tokenized = [doc.split() for doc in corpus]
            self.assertListEqual(expected, cache.insert_batch(tokenized))