示例#1
0
 def test_bf_check(self):
     """Verify check(): inserted keys are reported present, others absent."""
     blm = BloomFilter(est_elements=10, false_positive_rate=0.05)
     inserted = ("this is a test", "this is another test")
     never_inserted = ("this is yet another test", "this is not another test")

     # Populate first, then probe, exactly in the order the keys are listed.
     for key in inserted:
         blm.add(key)
     for key in inserted:
         self.assertEqual(blm.check(key), True)
     for key in never_inserted:
         self.assertEqual(blm.check(key), False)
示例#2
0
 def test_bf_check(self):
     ''' confirm check() answers True for added keys and False otherwise '''
     blm = BloomFilter(est_elements=10, false_positive_rate=0.05)
     blm.add('this is a test')
     blm.add('this is another test')

     # (phrase, expected check() result), probed in this order
     expectations = [
         ('this is a test', True),
         ('this is another test', True),
         ('this is yet another test', False),
         ('this is not another test', False),
     ]
     for phrase, expected in expectations:
         self.assertEqual(blm.check(phrase), expected)
示例#3
0
class Doorkeeper:
    """Tracks previously seen keys by delegating to a bloom filter.

    ``allow(key)`` records the key and returns whatever the filter's
    ``check`` reported for it *before* the key was added.
    """

    def __init__(self, cap=100000, false_positive=0.01):
        """Build the backing filter.

        ``cap`` and ``false_positive`` are forwarded positionally to
        ``BloomFilter`` as its first and second arguments.
        """
        self.bloom = BloomFilter(cap, false_positive)

    def __insert(self, key: str):
        """Add ``key`` to the filter; return the pre-insertion check result."""
        seen_before = self.bloom.check(key)
        self.bloom.add(key)
        return seen_before

    def allow(self, key: str):
        """Record ``key`` and return the filter's prior answer for it."""
        return self.__insert(key)

    def reset(self):
        """Clear the backing filter."""
        self.bloom.clear()
示例#4
0
    def test_another_hashing_algo(self):
        """Test supplying a completely different hashing strategy.

        Builds a filter with a custom hash callable, exports it so the file's
        MD5 can be compared with the digest recorded for the default hash,
        then exercises add/check/hashes against known values.
        """
        md5_val = "7f590086f9b962387e145899dd001256"  # for default hash used
        filename = "test.blm"
        results = [
            14409285476674975580,
            1383622036369840193,
            10825905054403519891,
            3456253732347153957,
            1494124715262089992,
        ]

        def my_hash(key, depth, encoding="utf-8"):
            """Return ``depth`` hashes, one per left-rotation of ``key``."""
            max64mod = UINT64_T_MAX + 1
            hashes = []  # distinct name: do not shadow the expected ``results``
            for i in range(depth):
                rotated = key[i:] + key[:i]
                digest = hashlib.sha512(rotated.encode(encoding)).hexdigest()
                hashes.append(int(digest, 16) % max64mod)
            return hashes

        blm = BloomFilter(est_elements=10,
                          false_positive_rate=0.05,
                          hash_function=my_hash)

        self.assertEqual(blm.elements_added, 0)
        blm.add("this is a test")

        # Remove the exported file even when export or the MD5 computation
        # raises, and before asserting, so a failure never leaves it behind.
        try:
            blm.export(filename)
            md5_out = calc_file_md5(filename)
        finally:
            if os.path.exists(filename):
                os.remove(filename)
        self.assertNotEqual(md5_out, md5_val)

        for i in range(0, 10):
            tmp = "this is a test {0}".format(i)
            blm.add(tmp)

        self.assertEqual(blm.elements_added, 11)

        for i in range(0, 10):
            tmp = "this is a test {0}".format(i)
            self.assertTrue(blm.check(tmp))

        self.assertEqual(blm.hashes("this is a test", 5), results)
        # Asking for a single hash must yield the first of the five above.
        res = blm.hashes("this is a test", 1)
        self.assertEqual(len(res), 1)
        self.assertEqual(res[0], results[0])
示例#5
0
    def test_bf_use_different_hash(self):
        """Check that a decorator-wrapped custom hash drives the filter."""
        md5_val = "7f590086f9b962387e145899dd001256"  # for default hash used
        base_key = "this is a test"
        results = [
            14409285476674975580,
            6203976290780191624,
            5074829385518853901,
            3953072760750514173,
            11782747630324011555,
        ]

        @hash_with_depth_int
        def my_hash(key, depth=1, encoding="utf-8"):
            """SHA-512 of the encoded key, reduced modulo UINT64_T_MAX + 1."""
            digest = hashlib.sha512(key.encode(encoding)).hexdigest()
            return int(digest, 16) % (UINT64_T_MAX + 1)

        blm = BloomFilter(
            est_elements=10, false_positive_rate=0.05, hash_function=my_hash
        )
        self.assertEqual(blm.elements_added, 0)
        blm.add(base_key)

        # Export to a temporary file and hash it while the file still exists.
        with NamedTemporaryFile(
            dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES
        ) as fobj:
            blm.export(fobj.name)
            md5_out = calc_file_md5(fobj.name)
        self.assertNotEqual(md5_out, md5_val)

        numbered_keys = ["this is a test {0}".format(i) for i in range(10)]
        for numbered in numbered_keys:
            blm.add(numbered)

        self.assertEqual(blm.elements_added, 11)

        for numbered in numbered_keys:
            self.assertTrue(blm.check(numbered))

        self.assertEqual(blm.hashes(base_key, 5), results)
        # A depth of one must give back only the first expected hash.
        single = blm.hashes(base_key, 1)
        self.assertEqual(len(single), 1)
        self.assertEqual(single[0], results[0])
示例#6
0
    def test_another_hashing_algo(self):
        ''' test defining a completely different hashing strategy

            The filter is exported so its MD5 can be compared with the digest
            recorded for the default hash; add/check/hashes are then checked
            against known values. '''
        md5_val = '7f590086f9b962387e145899dd001256'  # for default hash used
        filename = 'test.blm'
        results = [14409285476674975580,
                   1383622036369840193,
                   10825905054403519891,
                   3456253732347153957,
                   1494124715262089992]

        def my_hash(key, depth, encoding='utf-8'):
            ''' return `depth` hashes, one per left-rotation of `key` '''
            max64mod = UINT64_T_MAX + 1
            hashes = []  # distinct name: do not shadow the expected `results`
            for i in range(depth):
                rotated = key[i:] + key[:i]
                digest = hashlib.sha512(rotated.encode(encoding)).hexdigest()
                hashes.append(int(digest, 16) % max64mod)
            return hashes

        blm = BloomFilter(est_elements=10, false_positive_rate=0.05,
                          hash_function=my_hash)

        self.assertEqual(blm.elements_added, 0)
        blm.add('this is a test')

        # Clean up the exported file before asserting, and even if export or
        # the MD5 computation raises, so no stray file survives a failure.
        try:
            blm.export(filename)
            md5_out = calc_file_md5(filename)
        finally:
            if os.path.exists(filename):
                os.remove(filename)
        self.assertNotEqual(md5_out, md5_val)

        for i in range(0, 10):
            tmp = 'this is a test {0}'.format(i)
            blm.add(tmp)

        self.assertEqual(blm.elements_added, 11)

        for i in range(0, 10):
            tmp = 'this is a test {0}'.format(i)
            self.assertTrue(blm.check(tmp))

        self.assertEqual(blm.hashes('this is a test', 5), results)
        # requesting one hash must return the first of the five above
        res = blm.hashes('this is a test', 1)
        self.assertEqual(len(res), 1)
        self.assertEqual(res[0], results[0])
示例#7
0
    def test_bf_use_different_hash(self):
        ''' test that the different hash works as intended

            A custom hash wrapped with `hash_with_depth_int` is handed to the
            filter; the exported file's MD5 must differ from the digest
            recorded for the default hash. '''
        md5_val = '7f590086f9b962387e145899dd001256'  # for default hash used
        filename = 'test.blm'
        results = [
            14409285476674975580, 6203976290780191624, 5074829385518853901,
            3953072760750514173, 11782747630324011555
        ]

        @hash_with_depth_int
        def my_hash(key, encoding='utf-8'):
            ''' sha512 of the encoded key, reduced mod UINT64_T_MAX + 1 '''
            max64mod = UINT64_T_MAX + 1
            val = int(hashlib.sha512(key.encode(encoding)).hexdigest(), 16)
            return val % max64mod

        blm = BloomFilter(est_elements=10,
                          false_positive_rate=0.05,
                          hash_function=my_hash)
        self.assertEqual(blm.elements_added, 0)
        blm.add('this is a test')

        # Delete the exported file before asserting, and even when export or
        # the MD5 computation raises, so a failing run leaves nothing behind.
        try:
            blm.export(filename)
            md5_out = calc_file_md5(filename)
        finally:
            if os.path.exists(filename):
                os.remove(filename)
        self.assertNotEqual(md5_out, md5_val)

        for i in range(0, 10):
            tmp = 'this is a test {0}'.format(i)
            blm.add(tmp)

        self.assertEqual(blm.elements_added, 11)

        for i in range(0, 10):
            tmp = 'this is a test {0}'.format(i)
            self.assertTrue(blm.check(tmp))

        self.assertEqual(blm.hashes('this is a test', 5), results)
        # a depth of one must give back only the first expected hash
        res = blm.hashes('this is a test', 1)
        self.assertEqual(len(res), 1)
        self.assertEqual(res[0], results[0])
示例#8
0
    def test_bf_use_different_hash(self):
        ''' test that the different hash works as intended

            The filter is built with a custom hash wrapped by
            `hash_with_depth_int`, exported, and its file MD5 compared with
            the digest recorded for the default hash. '''
        md5_val = '7f590086f9b962387e145899dd001256'  # for default hash used
        filename = 'test.blm'
        results = [14409285476674975580,
                   6203976290780191624,
                   5074829385518853901,
                   3953072760750514173,
                   11782747630324011555]

        @hash_with_depth_int
        def my_hash(key, encoding='utf-8'):
            ''' sha512 of the encoded key, reduced mod UINT64_T_MAX + 1 '''
            max64mod = UINT64_T_MAX + 1
            val = int(hashlib.sha512(key.encode(encoding)).hexdigest(), 16)
            return val % max64mod

        blm = BloomFilter(est_elements=10, false_positive_rate=0.05,
                          hash_function=my_hash)
        self.assertEqual(blm.elements_added, 0)
        blm.add('this is a test')

        # The export file is removed in `finally` (and before the assertion)
        # so neither an exception nor a failed check leaves it on disk.
        try:
            blm.export(filename)
            md5_out = calc_file_md5(filename)
        finally:
            if os.path.exists(filename):
                os.remove(filename)
        self.assertNotEqual(md5_out, md5_val)

        for i in range(0, 10):
            tmp = 'this is a test {0}'.format(i)
            blm.add(tmp)

        self.assertEqual(blm.elements_added, 11)

        for i in range(0, 10):
            tmp = 'this is a test {0}'.format(i)
            self.assertTrue(blm.check(tmp))

        self.assertEqual(blm.hashes('this is a test', 5), results)
        # asking for a single hash must return the first expected value
        res = blm.hashes('this is a test', 1)
        self.assertEqual(len(res), 1)
        self.assertEqual(res[0], results[0])
示例#9
0
#%%Bloom filters

# Work on the ratings frame with its last 4,500,000 rows dropped.
ratings2 = ratings[:-4500000]

bf = BloomFilter(est_elements=1000, false_positive_rate=0.05)

# Insert every row's movieId, stringified, into the filter.
for index, row in ratings2.iterrows():
    bf.add(str(row['movieId']))
# Compare pickled sizes of the filter and of the raw movieId column.
print("\n Memory size of Bloomfilter list in bytes= ", len(pickle.dumps(bf)))
print("\n Memory size of Series stracture list in bytes= ",
      len(pickle.dumps(ratings2.movieId)))

bf_checklist = list()

# Re-check every inserted movieId; any "False" entry is a false negative.
for index, row in ratings2.iterrows():
    bf_checklist.append(str(bf.check(str(row['movieId']))))
res = [k for k in bf_checklist if str(False) in k]
print("Actual movies missed: ", len(res))  #false negatives

bf_checklist2 = list()

# Probe with single lowercase letters (presumably never inserted, since only
# stringified movieIds were added); any "True" entry is a false positive.
# Results go into bf_checklist2 -- appending to bf_checklist here would leave
# bf_checklist2 empty and always report zero false positives.
for i in string.ascii_lowercase:
    bf_checklist2.append(str(i in bf))
res = [k for k in bf_checklist2 if str(True) in k]
print("Number of false positives: ", len(res))  #false positives
#%% Count unique users and movies

ratings = ratings.iloc[:, 0:2]
users = list()
movies = list()