def testLSH(self):
    """Check LSHCache.insert_batch on short strings shingled with Shingler(1).

    Prints ``1 - jaccard_distance`` of the character sets for every pair of
    inputs (debugging aid only), then seeds ``random`` with a fixed value and
    asserts the exact list of sets returned by ``insert_batch``.
    """
    strings = [
        "abcdefghijklmnopqrstuvwxyz",
        "abcdefghijklmnopqrstuvw",
        "defghijklmnopqrstuvw",
        "zyxwvutsrqponmlkjihgfedcba",
        "1abcdefghijklmnopuvw1",
        "123456789",
        "012345678",
        "234567890",
    ]

    # Dump pairwise similarities so a failing expectation is easier to
    # diagnose. A single parenthesised argument keeps the output identical
    # under both the Python 2 print statement and the Python 3 function.
    for i, a in enumerate(strings):
        for j, b in enumerate(strings[i + 1:], i + 1):
            similarity = 1 - jaccard_distance(set(a), set(b))
            print("'%s' (%d) <=> (%d)'%s': %f" % (a, i, j, b, similarity))

    # Fixed seed so the expected sets below are reproducible.
    random.seed(12345)
    lsh = LSHCache(shingler=Shingler(1))

    # One set per inserted string, in insertion order.
    # NOTE(review): presumably each set holds the indices of earlier inputs
    # reported as matches -- confirm against LSHCache.insert_batch.
    expected = [
        set(),
        {0},
        {0, 1},
        {0, 1, 2},
        {0, 1, 2, 3},
        set(),
        {5},
        {5, 6},
    ]
    self.assertListEqual(expected, lsh.insert_batch(strings))
def testExample(self):
    """Check LSHCache.insert_batch on whitespace-tokenised sentences.

    The same documents are inserted into fresh caches built with different
    ``b``/``r`` (and once ``m``) settings; ``random`` is re-seeded before
    each cache is created so every expected result is reproducible.
    """
    docs = [
        "lipstick on a pig",
        "you can put lipstick on a pig",
        "you may put lipstick on a pig but it's still a pig",
        "you can put lipstick on a pig it's still a pig",
        "i think they put some lipstick on a pig but it's still a pig",
        "putting lipstick on a pig",
        "you know you can put lipstick on a pig",
        "they were going to send us binders full of women",
        "they were going to send us binders of women",
        "a b c d e f",
        "a b c d f",
    ]

    def insert_all(**cache_kwargs):
        # Re-seed, build a fresh cache, and insert freshly split documents.
        random.seed(12345)
        fresh_cache = LSHCache(**cache_kwargs)
        return fresh_cache.insert_batch([text.split() for text in docs])

    # least strict
    self.assertListEqual(
        [
            set(),
            {0},
            {0, 1},
            {0, 1, 2},
            {0, 1, 2, 3},
            {0, 1, 2, 3, 4},
            {0, 1, 2, 3, 4, 5},
            set(),
            {7},
            set(),
            {9},
        ],
        insert_all(b=50, r=2),
    )

    # stricter
    self.assertListEqual(
        [
            set(),
            {0},
            set(),
            {1},
            {2},
            {0, 1},
            {0, 1, 5},
            set(),
            {7},
            set(),
            {9},
        ],
        insert_all(b=25, r=4),
    )

    # stricter still
    self.assertListEqual(
        [
            set(),
            {0},
            set(),
            {1},
            set(),
            {0, 1},
            {0, 1, 3, 5},
            set(),
            {7},
            set(),
            set(),
        ],
        insert_all(b=20, r=5),
    )

    # most strict
    self.assertListEqual(
        [
            set(),
            set(),
            set(),
            set(),
            set(),
            set(),
            {1},
            set(),
            set(),
            set(),
            set(),
        ],
        insert_all(b=10, r=10),
    )

    # same b/r as the least strict case, but with m=3
    self.assertListEqual(
        [
            set(),
            {0},
            set(),
            {0, 1, 2},
            {0, 2, 3},
            {0, 1, 3},
            {0, 1, 3, 5},
            set(),
            {7},
            set(),
            {9},
        ],
        insert_all(b=50, r=2, m=3),
    )