def test_contain(self):
    """Membership must be False before an element is added and True after."""
    num_hashes, num_bits = 3, 128
    bloom = BloomFilter(num_bits, num_hashes)
    # Fresh filter: nothing should match yet.
    self.assertFalse(bloom.test('#'))
    bloom.add("#")
    # After insertion the same key must be reported present.
    self.assertTrue(bloom.test('#'))
class LocalBloomFilter:
    """Bloom filter with a locally held bitmap.

    Wraps a shared ``BloomFilter`` (which carries the sizing and hashing
    logic) while keeping the backing bitmap on this instance, so several
    ``LocalBloomFilter`` objects can share one configuration with
    independent state.
    """

    def __init__(self, capacity, error, prime_length=True):
        """Create the underlying filter and allocate a zeroed bitmap.

        :param capacity: expected number of elements.
        :param error: target false-positive rate.
        :param prime_length: forwarded to ``BloomFilter`` — presumably
            rounds the bit count to a prime; confirm in BloomFilter.
        """
        self.bf = BloomFilter(capacity, error, prime_length)
        # Floor division instead of int(bits / 8): true division goes
        # through a float and can lose precision for very large bit counts.
        # NOTE(review): ``bytes`` is immutable — if BloomFilter.add mutates
        # the bitmap in place this needs to be a ``bytearray``; confirm
        # against the BloomFilter implementation.
        self.bitmap = bytes(self.bf.bits // 8 + 1)

    def add(self, data):
        """Add a string, or every string in a list/tuple, to the filter."""
        if isinstance(data, (list, tuple)):
            for v in data:
                assert isinstance(
                    v, str), 'add() arg must be a str or list/tuple of strings'
                self.bf.add(self.bitmap, v)
        else:
            assert isinstance(
                data, str), 'add() arg must be a str or list/tuple of strings'
            self.bf.add(self.bitmap, data)

    def is_contain(self, data):
        """Test membership.

        Returns a list of booleans for a list/tuple argument, or a single
        boolean for a plain string.
        """
        if isinstance(data, (list, tuple)):
            # Validate every element before querying any of them.
            for v in data:
                assert isinstance(
                    v, str
                ), 'is_contain() arg must be a str or list/tuple of strings'
            return [self.bf.is_contain(self.bitmap, v) for v in data]
        else:
            assert isinstance(
                data, str), 'is_contain() arg must be a str or list/tuple of strings'
            return self.bf.is_contain(self.bitmap, data)

    def clean(self):
        """Reset the bitmap via the underlying filter."""
        self.bf.clean_bitmap(self.bitmap)
def url_test(positives, negatives, fp_rate):
    """Fill a filter with *positives* (sanity-checking each insert), then
    report its size, hash count, and false-positive rate over *negatives*."""
    bf = BloomFilter(len(positives), fp_rate, string_digest)
    for pos in positives:
        bf.add(pos)
        # An element just added must always be found.
        assert (bf.check(pos))
    print("Bits needed", bf.size)
    print("Hash functions needed", bf.hash_count)
    # Any negative that checks positive is a false positive.
    fp = float(sum(1 for neg in negatives if bf.check(neg)))
    print("False positives", fp / len(negatives))
def simple_test():
    """Insert ints 0..999, verify no false negatives, then probe
    1001..9999 to estimate the false-positive rate."""
    bf = BloomFilter(1000, .001, get_digest)
    for i in range(1000):
        bf.add(i)
        # A bloom filter must never produce a false negative.
        if not bf.check(i):
            print("False Negative!")
    probes = range(1001, 10000)
    fp = float(sum(1 for i in probes if bf.check(i)))
    count = float(len(probes))
    print("False Positive Rate: " + str(fp / count))
def string_test():
    """Same experiment as the integer test, but with random strings."""
    bf = BloomFilter(1000, .001, string_digest)
    for i in range(1000):
        random_string = generate_random_string(i)
        bf.add(random_string)
        # Freshly added strings must always be found.
        assert (bf.check(random_string))
    # Probe with strings that were (almost certainly) never added.
    trials = [generate_random_string(i) for i in range(1001, 10000)]
    fp = float(sum(1 for s in trials if bf.check(s)))
    count = float(len(trials))
    print("False Positive Rate: " + str(fp / count))
def read_write_test(word, items_count, fp_rpob, hash_cnt):
    """Exercise a single add followed by a membership query.

    :param word: key to insert and then look up.
    :param items_count: expected element count for the filter.
    :param fp_rpob: target false-positive probability.
    :param hash_cnt: number of hash functions.
    """
    # The banner literal has no placeholders, so a plain string suffices.
    separator = '==============================================================================================================='
    print(separator)
    print('single write and read test')
    print(separator)
    # eg : items_count = 1000, fp_prob = 0.01, hash_cnt=3,
    bloom_filter = BloomFilter(items_count, fp_rpob, hash_cnt)
    bloom_filter.add(word)
    print(f'add data: {word}')
    print(f'look up data: {word} = {bloom_filter.may_match(word)}')
class DeepBloom(object):
    """Learned bloom filter: a trained model screens items first, and a
    backup bloom filter holds the model's false negatives so the combined
    structure has no false negatives overall.

    The false-positive budget is split evenly: fp_rate/2 for the model
    threshold and fp_rate/2 for the backup filter.
    """

    def __init__(self, model, data, fp_rate):
        # model: classifier exposing predict()/predicts(); data: object with
        # .positives plus negatives consumed by split_negatives() — inferred
        # from usage below, confirm against the callers.
        self.model = model
        self.threshold = None  # score cutoff, chosen in fit()
        self.fp_rate = float(fp_rate)
        self.fit(data)
        self.create_bloom_filter(data)

    def check(self, item):
        """Return True if *item* is (probably) in the set."""
        # Model confident enough -> accept (this branch produces the
        # model's share of false positives).
        if self.model.predict(item) > self.threshold:
            return True
        # Otherwise fall back to the filter of known false negatives.
        return self.bloom_filter.check(item)

    def create_bloom_filter(self, data):
        """Build the backup filter over positives the model misses."""
        print("Creating bloom filter")
        false_negatives = []
        # predicts() is presumably the batch form of predict() — confirm
        # against the model API.
        preds = self.model.predicts(data.positives)
        for i in range(len(data.positives)):
            if preds[i] <= self.threshold:
                false_negatives.append(data.positives[i])
        print("Number of false negatives at bloom time", len(false_negatives))
        # Backup filter gets the other half of the FP budget.
        self.bloom_filter = BloomFilter(len(false_negatives), self.fp_rate / 2, string_digest)
        for fn in false_negatives:
            self.bloom_filter.add(fn)
        print("Created bloom filter")

    def fit(self, data):
        """Train the model and pick the score threshold."""
        ## Split negative data into subgroups.
        (s1, s2) = split_negatives(data)
        print("Training model with train, dev, positives", len(s1), len(s2), len(data.positives))
        ## Shuffle together subset of negatives and positives.
        ## Then, train the model on this data.
        shuffled = shuffle_for_training(s1, data.positives)
        self.model.fit(shuffled[0], shuffled[1])
        print("Done fitting")
        ## We want a threshold such that at most s2.size * fp_rate/2 elements
        ## are greater than threshold.
        fp_index = math.ceil((len(s2) * (1 - self.fp_rate / 2)))
        # NOTE(review): fp_index can equal len(s2) for very small fp_rate,
        # which would raise IndexError below — confirm expected input ranges.
        predictions = self.model.predicts(s2)
        predictions.sort()
        self.threshold = predictions[fp_index]
def test_bloom_filter():
    """Add known words, then query a shuffled mix of present/absent words."""
    bloomfilter = BloomFilter(NUM_KEYS, FALSE_POSITIVE_PROBABILITY)
    word_present = ['abound', 'abounds', 'abundance', 'abundant', 'accessable',
                    'bloom', 'blossom', 'bolster', 'bonny', 'bonus', 'bonuses',
                    'coherent', 'cohesive', 'colorful', 'comely', 'comfort',
                    'gems', 'generosity', 'generous', 'generously', 'genial']
    word_absent = ['facebook', 'twitter']

    for item in word_present:
        bloomfilter.add(item)

    # Probe a mix of ten known-present words and the absent ones.
    test_words = word_present[:10] + word_absent
    shuffle(test_words)
    for word in test_words:
        if not bloomfilter.is_member(word):
            # A negative answer from a bloom filter is always definitive.
            print(f"'{word}' is definitely not present!")
        elif word in word_absent:
            print(f"'{word}' is a false positive!")
        else:
            print(f"'{word}' is probably present!")
def makeFilterloadPayload(publicKeyHash, scriptHash, transactionID):
    """Build and serialize a filterload payload holding the three items."""
    # Renamed local to avoid shadowing the builtin `filter`.
    bloom = BloomFilter(3, 0.01, 0, 1)
    for element in (publicKeyHash, scriptHash, transactionID):
        bloom.add(element)
    print(len(bloom.bit_array))
    payload = bloom.serialize()
    return payload
# words to be added word_present = [ 'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'bloom', 'blossom', 'bolster', 'bonny', 'bonus', 'bonuses', 'coherent', 'cohesive', 'colorful', 'comely', 'comfort', 'gems', 'generosity', 'generous', 'generously', 'genial' ] # word not added word_absent = [ 'bluff', 'cheater', 'hate', 'war', 'humanity', 'racism', 'hurt', 'nuke', 'gloomy', 'facebook', 'geeksforgeeks', 'twitter' ] for item in word_present: bloomf.add(item) shuffle(word_present) shuffle(word_absent) test_words = word_present[:10] + word_absent for word in test_words: if bloomf.test(word): if word in word_absent: print("'{}' est un faux positif !".format(word)) else: print("'{}' est probablement présent !".format(word)) else: print("'{}' n'est définitivement pas présent !".format(word)) assert word in word_absent
from BloomFilter import BloomFilter

# Smoke test: keys that were added must check True, an unseen key False.
bf = BloomFilter()
for key in ("python", "vk"):
    bf.add(key)
assert (bf.check("python") == True)
assert (bf.check("vk") == True)
assert (bf.check("kaboom") == False)
output_file = open(sys.argv[2], "a+") p = float(sys.argv[3]) except IOError: print "Invalid input arguments! Check that the input file exists!" exit(0) except ValueError as e: print "Third argument not a float number!" exit(0) lines = input_file.readlines() bloom.initialize_using_np(len(lines), float(sys.argv[3])) start_time = time.time() for line in lines: line = line.strip().rstrip() bloom.add(line) input_file.close() print "Time needed to fill filter: ", time.time() - start_time, "s" #print "Memory occupied by bloom filter ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, "bytes" while (True): inputed = raw_input("Enter key to check if it is in filter: ") if (bloom.query(inputed)): print "Possible in filter" output_file.write("Key " + inputed + " possible in filter\n") else: print "Not in filter" output_file.write("Key " + inputed + " not in filter\n") output_file.close()
from BloomFilter import BloomFilter

bf = BloomFilter()
bf.add("python")
bf.add("vk")
# Present keys must check True; a key that was never added must check False.
for key, expected in (("python", True), ("vk", True), ("kaboom", False)):
    assert (bf.check(key) == expected)
from BloomFilter import BloomFilter

if __name__ == '__main__':
    # Some tests
    import random
    import time

    N = 10000000
    BF = BloomFilter(N, 7)
    # 500k random 15-mers over the DNA alphabet.
    kmers = [
        ''.join(random.choice('ACGT') for _ in range(15))
        for _ in range(500000)
    ]
    # Insert all but the last 1000; hold those out for FP probing.
    kmers_to_add = kmers[:-1000]
    kmers_to_test = kmers[len(kmers_to_add):]
    for kmer in kmers_to_add:
        BF.add(kmer)

    # Every inserted k-mer must be found: no false negatives allowed.
    false_neg = sum(1 for kmer in kmers_to_add if kmer not in BF)
    assert false_neg == 0

    # Count how many held-out k-mers collide into the filter.
    false_pos = sum(1 for kmer in kmers_to_test if kmer in BF)
    print(f'FP = {false_pos}')
output_file=open(sys.argv[2],"a+") p=float(sys.argv[3]) except IOError: print "Invalid input arguments! Check that the input file exists!" exit(0) except ValueError as e: print "Third argument not a float number!" exit(0) lines=input_file.readlines() bloom.initialize_using_np(len(lines),float(sys.argv[3])) start_time=time.time() for line in lines: line=line.strip().rstrip() bloom.add(line) input_file.close() print "Time needed to fill filter: ", time.time() - start_time,"s" #print "Memory occupied by bloom filter ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, "bytes" while(True): inputed=raw_input("Enter key to check if it is in filter: ") if(bloom.query(inputed)): print "Possible in filter" output_file.write("Key "+inputed+" possible in filter\n") else: print "Not in filter" output_file.write("Key "+inputed+" not in filter\n") output_file.close()
try: #save(root_url + list[i].get("href"),i) url = list[i].get("href") if url.startswith("ftp://"): print >> logsfile,url.encode('utf-8') logsfile.flush() elif(url.find("http://www.dy2018.com") == -1): url = root_url + list[i].get("href") if not bloom.__contains__(url): #print(str(i)+"------num---"+str(num-1)+"---------"+url) bloom.add(url) foreach(url,num) except Exception,e: continue #print >> logsfile,("foreach error") except Exception,e: return #print >> logsfile,e.message def save(url,title): print(str(title)+"=================="+url) if __name__ == '__main__': maxnum = 4 logsfile = open('./urls.log', 'a+') bloom = BloomFilter(160000,1000) bloom.add('http://www.dy2018.com/index.html') bloom.add('http://www.dy2018.com/') root_url = 'http://www.dy2018.com' foreach('http://www.dy2018.com/index.html',1) logsfile.close()
class DeeperBloom(object):
    '''
    fp_fractions is an array of size self.k + 1 containing the fraction of
    the fp_rate dedicated to each of k models, then bloom filter, in that
    order. If not passed, default to splitting false positives evenly.
    '''

    def __init__(self, models, data, fp_rate, fp_fractions=None):
        # Cascade of k models followed by a backup bloom filter; each stage
        # receives a share of the overall false-positive budget.
        self.models = models
        self.k = len(self.models)
        self.thresholds = [None] * self.k  # per-model score cutoffs, set in fit()
        if fp_fractions is None:
            # Split the budget evenly across k models + 1 bloom filter.
            self.fp_rate_bloom = float(fp_rate) / (self.k + 1)
            self.fp_rates = [float(fp_rate) / (self.k + 1)] * self.k
        else:
            self.fp_rate_bloom = fp_fractions[self.k] * fp_rate
            self.fp_rates = []
            for i in range(self.k):
                self.fp_rates.append(fp_fractions[i] * fp_rate)
        self.fit(data)
        self.create_bloom_filter(data)

    def check(self, item):
        """Accept if any model stage fires, else defer to the backup filter."""
        for i in range(self.k):
            if self.models[i].predict(item) > self.thresholds[i]:
                return True
        return self.bloom_filter.check(item)

    def create_bloom_filter(self, data):
        """Build the backup filter over positives that every model missed."""
        print("Creating bloom filter")
        false_negatives = []
        preds = []
        # predicts() is presumably the batch form of predict() — confirm
        # against the model API.
        for i in range(self.k):
            preds.append(self.models[i].predicts(data.positives))
        for j in range(len(data.positives)):
            # A positive is a false negative only if all k models score it
            # at or below their thresholds.
            is_false = True
            for i in range(self.k):
                pred = preds[i][j]
                if pred > self.thresholds[i]:
                    is_false = False
            if is_false:
                false_negatives.append(data.positives[j])
        print("Number of false negatives at bloom time", len(false_negatives))
        print("Effective bloom filter false positive rate", self.fp_rate_bloom)
        self.bloom_filter = BloomFilter(
            len(false_negatives), self.fp_rate_bloom, string_digest
        )
        for fn in false_negatives:
            self.bloom_filter.add(fn)
        print("Created bloom filter")

    def fit(self, data):
        """Train each stage on the items the previous stages got wrong,
        then pick its threshold from its dev-set score distribution."""
        ## Split negative data into subgroups.
        for i in range(self.k):
            # First prep s1, s2 and curr_positives
            print("Data prep", i)
            if i == 0:
                (s1, s2) = split_negatives(data)
                curr_positives = data.positives
                false_negatives = curr_positives
            else:
                # TODO BALANCE
                # TODO FIX curr_positives not carrying through all false negatives
                # TODO add back difficulty factor stuff?
                # DIFFICULTY_FACTOR = 1.2
                # Get false negatives from curr_positives, with
                # respect to prev model
                new_false_negatives = []
                new_positives = []
                preds = self.models[i - 1].predicts(false_negatives)
                for j in range(len(false_negatives)):
                    pred = preds[j]
                    # NOTE(review): the two conditions below are identical,
                    # so new_positives == new_false_negatives here — looks
                    # related to the TODO FIX above; confirm intent.
                    if pred <= self.thresholds[i - 1]:
                        new_false_negatives.append(false_negatives[j])
                    if pred <= self.thresholds[i - 1]:
                        new_positives.append(false_negatives[j])
                curr_positives = new_positives
                false_negatives = new_false_negatives
                # Get true negatives from s1, with respect to prev
                # model
                new_s1 = []
                preds = self.models[i - 1].predicts(s1)
                for j in range(len(s1)):
                    pred = preds[j]
                    if pred <= self.thresholds[i - 1]:
                        new_s1.append(s1[j])
                s1 = new_s1
                # Get true negatives from s2, with respect to prev
                # model
                new_s2 = []
                preds = self.models[i - 1].predicts(s2)
                for j in range(len(s2)):
                    pred = preds[j]
                    if pred <= self.thresholds[i - 1]:
                        new_s2.append(s2[j])
                s2 = new_s2
                # Ensure that s1 is balanced relative to curr_positives
                # if (len(s1) > len(curr_positives)):
                #     s1 = s1[:len(curr_positives)]
            print("Number of false negatives at this step", len(false_negatives))
            print("Training model with train, dev, positives", i, len(s1),
                  len(s2), len(curr_positives))
            random.shuffle(s1)
            random.shuffle(s2)
            random.shuffle(curr_positives)
            ## Shuffle together subset of negatives and positives.
            ## Then, train the model on this data.
            shuffled = shuffle_for_training(s1, curr_positives)
            self.models[i].fit(shuffled[0], shuffled[1])
            print("Done fitting")
            ## We want a threshold such that at most s2.size * fp_rates[i] elements
            ## are greater than threshold.
            print("Using effective false positive rate for model", i,
                  self.fp_rates[i])
            # NOTE(review): fp_index can equal len(s2) for tiny fp_rates[i],
            # which would raise IndexError below — confirm input ranges.
            fp_index = math.ceil((len(s2) * (1 - self.fp_rates[i])))
            predictions = self.models[i].predicts(s2)
            predictions.sort()
            self.thresholds[i] = predictions[fp_index]
            print("Threshold for model", i, self.thresholds[i])
def test_add(self):
    """Adding a single element must set between 1 and k (=3) bits."""
    num_hashes, num_bits = 3, 128
    bloom = BloomFilter(num_bits, num_hashes)
    bloom.add("#")
    # With k hash functions, at least one and at most k bits are set
    # (collisions between hashes may reduce the count below k).
    self.assertIn(bloom.bits.count(), range(1, 4))