Exemplo n.º 1
0
    def testLSHArgs(self):
        """Verify the values LSHCache reports for various constructor arguments.

        Each case builds a cache from a different mix of ``n`` (total rows),
        ``b`` (bands), ``r`` (rows per band), ``m`` (minimum support) and
        ``shingler``, then checks what the cache's accessors report.
        """
        def check(cache, bands, rows_per_band, total_rows, shingle_len, support):
            # The same five assertions, in the same order, for every case.
            self.assertEqual(bands, cache.num_bands())
            self.assertEqual(rows_per_band, cache.num_rows_per_band())
            self.assertEqual(total_rows, cache.num_total_rows())
            self.assertEqual(shingle_len, cache.shingler().shingle_len())
            self.assertEqual(support, cache.min_support())

        # No arguments at all: the defaults.
        check(LSHCache(), 20, 5, 100, 2, 1)

        # Any two of n / b / r given: the third is expected to follow.
        check(LSHCache(b=10, r=7), 10, 7, 70, 2, 1)
        check(LSHCache(n=70, r=7), 10, 7, 70, 2, 1)
        check(LSHCache(n=70, b=10, m=3), 10, 7, 70, 2, 3)

        # All three given, and consistent with one another.
        check(LSHCache(n=70, b=10, r=7), 10, 7, 70, 2, 1)

        # A custom shingler leaves the band/row defaults untouched.
        check(LSHCache(shingler=Shingler(5)), 20, 5, 100, 5, 1)
        check(LSHCache(shingler=Shingler(2, 3)), 20, 5, 100, (2, 3), 1)
Exemplo n.º 2
0
    def testLSH(self):
        """Insert a fixed batch of strings and check the result of insert_batch.

        The pairwise character-set similarities are printed first as a
        debugging aid. The expected value lists, for each string, a set of
        indices of earlier strings (presumably the earlier inserts it was
        matched against -- confirm against LSHCache.insert_batch).
        """
        strings = [
            "abcdefghijklmnopqrstuvwxyz",
            "abcdefghijklmnopqrstuvw",
            "defghijklmnopqrstuvw",
            "zyxwvutsrqponmlkjihgfedcba",
            "1abcdefghijklmnopuvw1",
            "123456789",
            "012345678",
            "234567890",
        ]
        for i, a in enumerate(strings):
            # Start the inner counter at i + 1 so j is b's index in `strings`.
            for j, b in enumerate(strings[i + 1:], i + 1):
                similarity = 1 - jaccard_distance(set(a), set(b))
                # Parenthesised single argument: prints the same text whether
                # `print` is the Python 2 statement or the Python 3 function.
                print("'%s' (%d) <=> (%d)'%s': %f" % (a, i, j, b, similarity))

        # Fixed seed; presumably LSHCache draws from `random`, so this keeps
        # the expected result reproducible.
        random.seed(12345)
        lsh = LSHCache(shingler=Shingler(1))
        expected = [
            set(),
            set([0]),
            set([0, 1]),
            set([0, 1, 2]),
            set([0, 1, 2, 3]),
            set(),
            set([5]),
            set([5, 6]),
        ]
        self.assertListEqual(expected, lsh.insert_batch(strings))
Exemplo n.º 3
0
 def testMultiLen(self):
     """Check a Shingler built with two lengths (2 and 3) on shrinking inputs.

     The expectations show that an input shorter than a shingle length is
     expected to be padded on the left with None.
     """
     shingler = Shingler(2, 3)

     produced = set(shingler.shingle("abcdef"))
     wanted = set(tuple(text) for text in
                  ("ab", "bc", "cd", "de", "ef", "abc", "bcd", "cde", "def"))
     self.assertSetEqual(wanted, produced)

     # Exactly as long as the longer shingle: no padding expected.
     wanted = set([('a', 'b'), ('b', 'c'), ('a', 'b', 'c')])
     self.assertSetEqual(wanted, set(shingler.shingle("abc")))

     # Shorter than the longer shingle: the 3-shingle is left-padded.
     wanted = set([('a', 'b'), (None, 'a', 'b')])
     self.assertSetEqual(wanted, set(shingler.shingle("ab")))

     # Shorter than both lengths: both shingles are left-padded.
     wanted = set([(None, 'a'), (None, None, 'a')])
     self.assertSetEqual(wanted, set(shingler.shingle("a")))
Exemplo n.º 4
0
def lsh_cache_from_args(args):
    """Build an LSHCache from a parsed-arguments object.

    Calls ``seed_from_args(args)`` first, then collects keyword arguments
    for ``LSHCache``: a ``Shingler`` built from ``args.shingle_len``, the
    minhash looked up in ``minhash_choices`` when ``args.minhash`` is set,
    and every numeric option whose value on ``args`` is truthy.

    :param args: object exposing ``shingle_len``, ``minhash``, ``num_total``,
        ``num_bands``, ``num_rows``, ``min_support`` and ``universe_size``
        attributes (presumably an argparse namespace -- confirm with caller).
    :return: the configured ``LSHCache`` instance.
    """
    seed_from_args(args)

    options = dict(shingler=Shingler(*args.shingle_len))
    if args.minhash:
        options['minhash'] = minhash_choices[args.minhash]

    # (attribute name on args, keyword accepted by LSHCache)
    option_names = (
        ('num_total', 'n'),
        ('num_bands', 'b'),
        ('num_rows', 'r'),
        ('min_support', 'm'),
        ('universe_size', 'universe_size'),
    )
    for attr_name, keyword in option_names:
        setting = getattr(args, attr_name)
        if not setting:
            # Falsy values (None, 0, ...) are left for LSHCache to default.
            continue
        options[keyword] = setting

    return LSHCache(**options)
Exemplo n.º 5
0
 def testBadArgs(self):
     """Shingler must reject invalid length arguments with AssertionError."""
     # A zero length, then a (2, 1) pair of lengths.
     for bad_lengths in ((0,), (2, 1)):
         with self.assertRaises(AssertionError):
             Shingler(*bad_lengths)
Exemplo n.º 6
0
 def testLenTwo(self):
     """Check the shingles a length-2 Shingler produces, in order.

     A one-character input is expected to give a single shingle padded on
     the left with None.
     """
     s = Shingler(2)
     shingles = list(s.shingle("abcdef"))
     # Build a real list: on Python 3 map() returns an iterator, which
     # assertListEqual rejects as "not a list".
     expected = [tuple(pair) for pair in ("ab", "bc", "cd", "de", "ef")]
     self.assertListEqual(expected, shingles)
     self.assertListEqual([(None, 'a')], list(s.shingle("a")))
     self.assertListEqual([('a', 'b')], list(s.shingle("ab")))
Exemplo n.º 7
0
 def testLenOne(self):
     """Check that a length-1 Shingler yields one 1-tuple per character, in order."""
     s = Shingler(1)
     shingles = list(s.shingle("abcdef"))
     # Build a real list: on Python 3 map() returns an iterator, which
     # assertListEqual rejects as "not a list".
     expected = [tuple(char) for char in "abcdef"]
     self.assertListEqual(expected, shingles)
     self.assertListEqual([('a',)], list(s.shingle("a")))