def testLSH(self):
    """Check insert_batch results for plain strings with Shingler(1).

    As a debugging aid, first prints the Jaccard similarity (one minus
    jaccard_distance over the character sets) of every pair of test
    strings. Then seeds `random` and asserts the exact list of sets
    returned by LSHCache.insert_batch, one set per input string.
    """
    strings = [
        "abcdefghijklmnopqrstuvwxyz",
        "abcdefghijklmnopqrstuvw",
        "defghijklmnopqrstuvw",
        "zyxwvutsrqponmlkjihgfedcba",
        "1abcdefghijklmnopuvw1",
        "123456789",
        "012345678",
        "234567890",
    ]

    # Pairwise similarity report; `j` is the absolute index of `b` in
    # `strings` because enumerate starts counting at i + 1.
    for i, a in enumerate(strings):
        for j, b in enumerate(strings[i + 1:], i + 1):
            similarity = 1 - jaccard_distance(set(a), set(b))
            # Single parenthesised argument: prints identically on
            # Python 2 and is also valid syntax on Python 3.
            print("'%s' (%d) <=> (%d)'%s': %f" % (a, i, j, b, similarity))

    # Fixed seed so the asserted result is reproducible (presumably
    # LSHCache draws its hashing parameters from `random` -- verify).
    random.seed(12345)
    lsh = LSHCache(shingler=Shingler(1))
    expected = [
        set(),
        {0},
        {0, 1},
        {0, 1, 2},
        {0, 1, 2, 3},
        set(),
        {5},
        {5, 6},
    ]
    self.assertListEqual(expected, lsh.insert_batch(strings))
def testBadArgs(self):
    """LSHCache must raise AssertionError for each of these n/b/r combinations."""
    # All three given, but 10 * 7 is not 100.
    self.assertRaises(AssertionError, LSHCache, b=10, r=7, n=100)
    # prime number of rows
    self.assertRaises(AssertionError, LSHCache, n=101)
    # 100 is not a multiple of 7, whether 7 is passed as r or as b.
    self.assertRaises(AssertionError, LSHCache, n=100, r=7)
    self.assertRaises(AssertionError, LSHCache, n=100, b=7)
def lsh():
    """Print, for every line of the file ``clean_data``, what LSHCache.insert returned.

    Each line is decoded as UTF-8, segmented with ``jieba.cut`` and
    re-joined with single spaces. The space-split tokens of each document
    are inserted into one LSHCache under the document's line index. Lines
    with a non-empty insert result are printed together with the
    documents that result refers to; the others are reported as having
    no duplicates.

    Raises whatever ``open`` raises when ``clean_data`` is missing, and
    UnicodeDecodeError when a line is not valid UTF-8.
    """
    # Read raw bytes and decode explicitly. The `with` block closes the
    # handle (the file used to be left open), and decoding bytes works
    # the same way on Python 2 and Python 3.
    with open('clean_data', 'rb') as source:
        lns = [ln.decode('utf-8') for ln in source]

    cache = LSHCache()

    # Every token produced by jieba is kept, single-character ones included.
    docs = [' '.join(jieba.cut(ln)) for ln in lns]

    dups = {}
    for i, doc in enumerate(docs):
        dups[i] = cache.insert(doc.split(), i)

    for i, duplist in dups.items():
        if duplist:
            print('orig [%d]: %s' % (i, docs[i]))
            for dup in duplist:
                print('\tdup : [%d] %s' % (dup, docs[dup]))
        else:
            print('no dups found for doc [%d] : %s' % (i, docs[i]))
def lsh_cache_from_args(args):
    """Build an LSHCache from a parsed-arguments object.

    Calls seed_from_args(args) first, then assembles the constructor
    keyword arguments:

    * ``shingler`` is always Shingler(*args.shingle_len);
    * ``minhash`` is looked up in minhash_choices when args.minhash is truthy;
    * ``n``, ``b``, ``r``, ``m`` and ``universe_size`` are forwarded from
      args.num_total, args.num_bands, args.num_rows, args.min_support and
      args.universe_size, but only when the value is truthy.

    Returns the constructed LSHCache.
    """
    seed_from_args(args)

    options = {"shingler": Shingler(*args.shingle_len)}
    if args.minhash:
        options['minhash'] = minhash_choices[args.minhash]

    # (attribute on `args`, keyword accepted by LSHCache)
    forwarded = (
        ('num_total', 'n'),
        ('num_bands', 'b'),
        ('num_rows', 'r'),
        ('min_support', 'm'),
        ('universe_size', 'universe_size'),
    )
    for attr_name, option_name in forwarded:
        setting = getattr(args, attr_name)
        if not setting:
            # Falsy values are left out of the constructor call.
            continue
        options[option_name] = setting

    return LSHCache(**options)
import pprint
import sys, os
# Put the directory two levels above the current working directory at the
# front of the module search path before importing `lsh` (presumably so the
# in-tree package is picked up -- confirm against the repository layout).
sys.path.insert(0, os.path.abspath('../..'))
from lsh import LSHCache
if __name__ == '__main__':
    cache = LSHCache()
    # Sample documents; several are small variations of one another.
    docs = [
        "lipstick on a pig",
        "you can put lipstick on a pig",
        "you can put lipstick on a pig but it's still a pig",
        "you can put lipstick on a pig it's still a pig",
        "i think they put some lipstick on a pig but it's still a pig",
        "putting lipstick on a pig",
        "you know you can put lipstick on a pig",
        "they were going to send us binders full of women",
        "they were going to send us binders of women",
        "a b c d e f",
        "a b c d f"
    ]
    # Maps each document index to whatever cache.insert returned for it.
    dups = {}
    for i, doc in enumerate(docs):
        # Documents are inserted as whitespace-separated token lists,
        # using the list index as the document id.
        dups[i] = cache.insert(doc.split(), i)
    for i, duplist in dups.items():
        # Only documents with a non-empty insert result are reported here.
        if duplist:
            print 'orig [%d]: %s' % (i, docs[i])
# NOTE(review): `sys`, `os` and `random` are used below but their imports
# are not part of the visible source -- presumably they precede this line.
# Put the directory two levels above the current working directory at the
# front of the module search path before importing `lsh`.
sys.path.insert(0, os.path.abspath('../..'))
from lsh import LSHCache
def random_int_list(start, stop, length):
    """Return a list of `length` integers drawn with random.randint.

    start, stop -- bounds of the draw. They are compared as given, each is
        converted with int(), and they are swapped when start > stop so
        that the smaller one is passed to randint first.
    length -- number of values to draw. The sign is dropped with abs()
        and the result converted with int(); a falsy value gives 0.

    Each element comes from random.randint(start, stop), which includes
    both bounds.
    """
    start, stop = (int(start), int(stop)) if start <= stop else (int(stop), int(start))
    length = int(abs(length)) if length else 0
    random_list = []
    for i in range(length):
        random_list.append(random.randint(start, stop))
    return random_list
if __name__ == '__main__':
    cache = LSHCache()
    # Sample documents; several are small variations of one another.
    docs = [
        "lipstick on a pig",
        "you can put lipstick on a pig",
        "you can put lipstick on a pig but it's still a pig",
        "you can put lipstick on a pig it's still a pig",
        "i think they put some lipstick on a pig but it's still a pig",
        "putting lipstick on a pig",
        "you know you can put lipstick on a pig",
        "they were going to send us binders full of women",
        "they were going to send us binders of women",
        "a b c d e f",
        "a b c d f"
    ]
    # Both containers start empty; the code that fills them is not visible here.
    sig_mat = []
    dups = {}
    # NOTE(review): the condition is the constant 0, so this branch never
    # runs; its body lies outside the visible source.
    if (0):
def testExample(self):
    """Pin the insert_batch results on the sample sentences for several b/r/m settings.

    Every scenario re-seeds `random` with the same value, builds a fresh
    LSHCache and inserts the whitespace-tokenised documents in one batch.
    The expected value is the list of sets returned, one per document.
    """
    docs = [
        "lipstick on a pig",
        "you can put lipstick on a pig",
        "you may put lipstick on a pig but it's still a pig",
        "you can put lipstick on a pig it's still a pig",
        "i think they put some lipstick on a pig but it's still a pig",
        "putting lipstick on a pig",
        "you know you can put lipstick on a pig",
        "they were going to send us binders full of women",
        "they were going to send us binders of women",
        "a b c d e f",
        "a b c d f",
    ]

    def batch_result(**cache_args):
        # Seed first, then construct, then insert -- the same order for
        # every scenario so the runs are comparable.
        random.seed(12345)
        cache = LSHCache(**cache_args)
        return cache.insert_batch([doc.split() for doc in docs])

    # least strict
    self.assertListEqual(
        [set(), {0}, {0, 1}, {0, 1, 2}, {0, 1, 2, 3}, {0, 1, 2, 3, 4},
         {0, 1, 2, 3, 4, 5}, set(), {7}, set(), {9}],
        batch_result(b=50, r=2))

    # stricter
    self.assertListEqual(
        [set(), {0}, set(), {1}, {2}, {0, 1}, {0, 1, 5}, set(), {7},
         set(), {9}],
        batch_result(b=25, r=4))

    # stricter still
    self.assertListEqual(
        [set(), {0}, set(), {1}, set(), {0, 1}, {0, 1, 3, 5}, set(), {7},
         set(), set()],
        batch_result(b=20, r=5))

    # most strict
    self.assertListEqual(
        [set(), set(), set(), set(), set(), set(), {1}, set(), set(),
         set(), set()],
        batch_result(b=10, r=10))

    # same b and r as the least strict scenario, now with m=3
    self.assertListEqual(
        [set(), {0}, set(), {0, 1, 2}, {0, 2, 3}, {0, 1, 3}, {0, 1, 3, 5},
         set(), {7}, set(), {9}],
        batch_result(b=50, r=2, m=3))
def testLSHArgs(self):
    """Verify the parameters LSHCache reports for various constructor arguments.

    For each construction the test checks, in this order: number of
    bands, rows per band, total rows, the shingler's shingle length and
    the minimum support.
    """
    def check(cache, bands, rows, total, shingle_len, support):
        self.assertEqual(bands, cache.num_bands())
        self.assertEqual(rows, cache.num_rows_per_band())
        self.assertEqual(total, cache.num_total_rows())
        self.assertEqual(shingle_len, cache.shingler().shingle_len())
        self.assertEqual(support, cache.min_support())

    # No arguments: defaults.
    check(LSHCache(), 20, 5, 100, 2, 1)

    # Any two of n / b / r determine the third.
    check(LSHCache(b=10, r=7), 10, 7, 70, 2, 1)
    check(LSHCache(n=70, r=7), 10, 7, 70, 2, 1)
    check(LSHCache(n=70, b=10, m=3), 10, 7, 70, 2, 3)

    # All three given and consistent.
    check(LSHCache(n=70, b=10, r=7), 10, 7, 70, 2, 1)

    # A custom shingler changes only the reported shingle length.
    check(LSHCache(shingler=Shingler(5)), 20, 5, 100, 5, 1)
    check(LSHCache(shingler=Shingler(2, 3)), 20, 5, 100, (2, 3), 1)
def testPercentFound(self):
    """Check theoretical_percent_found at similarities 0.5 and 0.8 for several b/r/m settings."""
    # Small configurations: the value at 0.5 is compared exactly, the
    # value at 0.8 with assertAlmostEqual's default precision.
    two_bands = LSHCache(b=2, r=1)
    self.assertEqual(0.75, two_bands.theoretical_percent_found(0.5))
    self.assertAlmostEqual(0.96, two_bands.theoretical_percent_found(0.8))

    one_band = LSHCache(b=1, r=2)
    self.assertEqual(0.25, one_band.theoretical_percent_found(0.5))
    self.assertAlmostEqual(0.64, one_band.theoretical_percent_found(0.8))

    def check_rounded(cache, at_half, at_four_fifths):
        # Larger configurations are compared to four decimal places.
        self.assertAlmostEqual(
            at_half, cache.theoretical_percent_found(0.5), places=4)
        self.assertAlmostEqual(
            at_four_fifths, cache.theoretical_percent_found(0.8), places=4)

    check_rounded(LSHCache(b=10, r=10), 0.0097, 0.6789)
    check_rounded(LSHCache(b=20, r=5), 0.4701, 0.9996)
    check_rounded(LSHCache(b=25, r=4), 0.8008, 1.0000)
    check_rounded(LSHCache(b=25, r=4, m=3), 0.2032, 0.9997)
def testClear(self):
    """After clear(), the same inserts must give the same results as on a fresh cache."""
    random.seed(12345)
    lsh = LSHCache()

    # (document, set expected back from insert), in insertion order.
    script = (
        ("123456789", set()),
        ("34567890", {0}),
        ("0123456", {0}),
        ("123456789", {0, 1, 2}),
    )

    def replay():
        for document, expected in script:
            self.assertSetEqual(expected, lsh.insert(document))

    replay()
    lsh.clear()
    # The cache is not rebuilt or re-seeded; only clear() separates the runs.
    replay()