示例#1
0
 def test_contain(self):
     """A key is reported absent before ``add`` and present afterwards."""
     m, k = 128, 3
     bf = BloomFilter(m, k)
     # Nothing has been inserted yet, so the lookup must miss.
     self.assertFalse(bf.test('#'))
     bf.add("#")
     # The same key must now be found.
     self.assertTrue(bf.test('#'))
示例#2
0
class LocalBloomFilter():
    """Pair a ``BloomFilter`` with a locally held bitmap buffer.

    Every call on the wrapped filter receives ``self.bitmap`` as its first
    argument; this class allocates that buffer and accepts either a single
    string or a list/tuple of strings.
    """

    def __init__(self, capacity, error, prime_length=True):
        """Create the wrapped filter and allocate its bitmap buffer.

        All three arguments are forwarded to ``BloomFilter`` unchanged.
        """
        self.bf = BloomFilter(capacity, error, prime_length)
        # Buffer length is the truncated bits / 8 plus one extra byte.
        byte_count = int(self.bf.bits / 8) + 1
        self.bitmap = bytes(byte_count)

    def add(self, data):
        """Insert one string, or every string of a list/tuple.

        Items are checked and inserted one at a time, so a non-string item
        raises ``AssertionError`` after the items before it were inserted.
        """
        message = 'add() arg must be a str or list/tuple of strings'
        items = data if isinstance(data, (list, tuple)) else (data,)
        for item in items:
            assert isinstance(item, str), message
            self.bf.add(self.bitmap, item)

    def is_contain(self, data):
        """Look up one string, or every string of a list/tuple.

        Returns the wrapped filter's answer for a single string, or a list of
        answers (in input order) for a list/tuple. A sequence is validated in
        full before any lookup is made.
        """
        message = 'is_contain() arg must be a str or list/tuple of strings'
        if not isinstance(data, (list, tuple)):
            assert isinstance(data, str), message
            return self.bf.is_contain(self.bitmap, data)
        for item in data:
            assert isinstance(item, str), message
        return [self.bf.is_contain(self.bitmap, item) for item in data]

    def clean(self):
        """Ask the wrapped filter to clean the bitmap buffer."""
        self.bf.clean_bitmap(self.bitmap)
示例#3
0
def url_test(positives, negatives, fp_rate):
    """Fill a filter with ``positives`` and report how often ``negatives`` hit.

    Prints the filter's ``size`` and ``hash_count`` attributes, then the
    fraction of ``negatives`` for which ``check`` returns a truthy value.
    """
    bloom = BloomFilter(len(positives), fp_rate, string_digest)
    for url in positives:
        bloom.add(url)
        # Every key must be found immediately after it is inserted.
        assert bloom.check(url)
    print("Bits needed", bloom.size)
    print("Hash functions needed", bloom.hash_count)

    # Kept as a float so the printed ratio is a true division.
    hits = float(sum(1 for url in negatives if bloom.check(url)))
    print("False positives", hits / len(negatives))
示例#4
0
def simple_test():
    """Insert the integers 0..999 and print the false-positive rate on 1001..9999."""
    bloom = BloomFilter(1000, .001, get_digest)
    for key in range(1000):
        bloom.add(key)
        if bloom.check(key):
            continue
        # A key that was just added must never be reported missing.
        print("False Negative!")

    # None of these integers were inserted, so every hit is a false positive.
    probes = range(1001, 10000)
    hits = float(sum(1 for key in probes if bloom.check(key)))
    total = float(len(probes))

    print("False Positive Rate: " + str(hits / total))
示例#5
0
def string_test():
    """Insert 1000 generated strings and print the hit rate on 8999 further ones."""
    bloom = BloomFilter(1000, .001, string_digest)
    for seed in range(1000):
        inserted = generate_random_string(seed)
        bloom.add(inserted)
        # Each string must be found right after insertion.
        assert bloom.check(inserted)

    # One string is generated per probe index and checked immediately; every
    # hit is counted towards the reported rate.
    probes = range(1001, 10000)
    hits = float(
        sum(1 for seed in probes if bloom.check(generate_random_string(seed))))
    total = float(len(probes))

    print("False Positive Rate: " + str(hits / total))
示例#6
0
def read_write_test(word, items_count, fp_rpob, hash_cnt):
    """
    Add a single word to a fresh filter and print the result of looking it up.

    ``items_count``, ``fp_rpob`` and ``hash_cnt`` are passed to ``BloomFilter``
    in that order.
    """
    banner = '==============================================================================================================='
    for heading_line in (banner, 'single write and read test', banner):
        print(heading_line)
    # eg : items_count = 1000, fp_prob = 0.01, hash_cnt=3,
    bloom_filter = BloomFilter(items_count, fp_rpob, hash_cnt)
    bloom_filter.add(word)
    print(f'add data: {word}')
    lookup = bloom_filter.may_match(word)
    print(f'look up data: {word} = {lookup}')
示例#7
0
class DeepBloom(object):
    """Membership filter made of one model plus a backup Bloom filter.

    The model is fitted on construction and a score threshold is chosen from
    held-out negatives. Positives the model scores at or below the threshold
    are stored in a Bloom filter so that ``check`` still finds them.
    """

    def __init__(self, model, data, fp_rate):
        """Fit ``model`` on ``data`` and build the backup Bloom filter.

        ``fp_rate`` is converted to float; half of it is used when choosing
        the model threshold and half is given to the Bloom filter.
        """
        self.model = model
        self.threshold = None
        self.fp_rate = float(fp_rate)
        self.fit(data)
        self.create_bloom_filter(data)

    def check(self, item):
        """Return True if the model score exceeds the threshold, else ask the Bloom filter."""
        if self.model.predict(item) > self.threshold:
            return True
        return self.bloom_filter.check(item)

    def create_bloom_filter(self, data):
        """Store every positive the model fails to accept in a new Bloom filter."""
        print("Creating bloom filter")
        preds = self.model.predicts(data.positives)
        # A positive scored at or below the threshold would be missed by the
        # model alone, so it has to go into the backup filter.
        false_negatives = [
            positive for positive, pred in zip(data.positives, preds)
            if pred <= self.threshold
        ]
        print("Number of false negatives at bloom time", len(false_negatives))
        self.bloom_filter = BloomFilter(len(false_negatives), self.fp_rate / 2,
                                        string_digest)
        for fn in false_negatives:
            self.bloom_filter.add(fn)
        print("Created bloom filter")

    @staticmethod
    def _threshold_index(sample_count, fp_rate):
        """Return the index into ascending scores used as the threshold.

        The raw value ``ceil(sample_count * (1 - fp_rate))`` equals
        ``sample_count`` whenever ``sample_count * fp_rate < 1``, which is one
        past the last valid index. It is clamped to the last element; using
        the maximum score as threshold leaves no sample above it, which still
        respects the false-positive budget.
        """
        raw_index = int(math.ceil(sample_count * (1 - fp_rate)))
        return min(raw_index, sample_count - 1)

    def fit(self, data):
        """Train the model and pick the score threshold from held-out negatives.

        Raises IndexError if the held-out split is empty, because no
        threshold can be chosen then.
        """
        ## Split negative data into subgroups.
        (s1, s2) = split_negatives(data)
        print("Training model with train, dev, positives", len(s1), len(s2),
              len(data.positives))

        ## Shuffle together subset of negatives and positives.
        ## Then, train the model on this data.
        shuffled = shuffle_for_training(s1, data.positives)
        self.model.fit(shuffled[0], shuffled[1])
        print("Done fitting")

        ## We want a threshold such that at most s2.size * fp_rate/2 elements
        ## are greater than threshold.
        fp_index = self._threshold_index(len(s2), self.fp_rate / 2)
        predictions = self.model.predicts(s2)
        predictions.sort()
        self.threshold = predictions[fp_index]
示例#8
0
def test_bloom_filter():
    """Insert a fixed word list and print the filter's verdict for a shuffled sample.

    The sample is the first ten inserted words plus two words that were never
    inserted.
    """
    bloomfilter = BloomFilter(NUM_KEYS, FALSE_POSITIVE_PROBABILITY)
    word_present = [
        'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'bloom',
        'blossom', 'bolster', 'bonny', 'bonus', 'bonuses', 'coherent',
        'cohesive', 'colorful', 'comely', 'comfort', 'gems', 'generosity',
        'generous', 'generously', 'genial',
    ]
    word_absent = ['facebook', 'twitter']

    for item in word_present:
        bloomfilter.add(item)

    test_words = word_present[:10] + word_absent
    shuffle(test_words)
    for word in test_words:
        if not bloomfilter.is_member(word):
            print(f"'{word}' is definitely not present!")
            continue
        # The filter says "member": only a word that was never added can be a
        # false positive.
        if word in word_absent:
            print(f"'{word}' is a false positive!")
        else:
            print(f"'{word}' is probably present!")
示例#9
0
def makeFilterloadPayload(publicKeyHash, scriptHash, transactionID):
    """Add the three given values to a new filter and return its serialized form.

    The length of the filter's ``bit_array`` is printed before serializing.
    """
    bloom = BloomFilter(3, 0.01, 0, 1)
    for element in (publicKeyHash, scriptHash, transactionID):
        bloom.add(element)
    print(len(bloom.bit_array))
    return bloom.serialize()
示例#10
0
# Words that are inserted into the filter.
word_present = [
    'abound', 'abounds', 'abundance', 'abundant', 'accessable',
    'bloom', 'blossom', 'bolster', 'bonny', 'bonus', 'bonuses',
    'coherent', 'cohesive', 'colorful', 'comely', 'comfort',
    'gems', 'generosity', 'generous', 'generously', 'genial',
]

# Words that are never inserted; a hit on one of these is a false positive.
word_absent = [
    'bluff', 'cheater', 'hate', 'war', 'humanity', 'racism',
    'hurt', 'nuke', 'gloomy', 'facebook', 'geeksforgeeks', 'twitter',
]

for item in word_present:
    bloomf.add(item)

# Shuffle both lists in place before the probe sample is taken.
shuffle(word_present)
shuffle(word_absent)

# Ten of the inserted words followed by every word that was not inserted.
test_words = word_present[:10] + word_absent

for word in test_words:
    if not bloomf.test(word):
        print("'{}' n'est définitivement pas présent !".format(word))
        # A miss is only acceptable for a word that was never added.
        assert word in word_absent
        continue
    if word in word_absent:
        print("'{}' est un faux positif !".format(word))
    else:
        print("'{}' est probablement présent !".format(word))
from BloomFilter import BloomFilter

# Smoke test: two inserted keys must be found, one other key must not be.
bf = BloomFilter()

for inserted_key in ("python", "vk"):
    bf.add(inserted_key)

for probe, expected in (("python", True), ("vk", True), ("kaboom", False)):
    assert bf.check(probe) == expected
示例#12
0
    output_file = open(sys.argv[2], "a+")
    p = float(sys.argv[3])

except IOError:
    print "Invalid input arguments! Check that the input file exists!"
    exit(0)
except ValueError as e:
    print "Third argument not a float number!"
    exit(0)
lines = input_file.readlines()
bloom.initialize_using_np(len(lines), float(sys.argv[3]))

start_time = time.time()
for line in lines:
    line = line.strip().rstrip()
    bloom.add(line)

input_file.close()

print "Time needed to fill filter: ", time.time() - start_time, "s"
#print "Memory occupied by bloom filter ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, "bytes"
while (True):
    inputed = raw_input("Enter key to check if it is in filter: ")
    if (bloom.query(inputed)):
        print "Possible in filter"
        output_file.write("Key " + inputed + " possible in filter\n")
    else:
        print "Not in filter"
        output_file.write("Key " + inputed + " not in filter\n")
output_file.close()
from BloomFilter import BloomFilter

# Smoke test: keys that were added are reported present, an unseen key is not.
bf = BloomFilter()

expected_by_key = (("python", True), ("vk", True), ("kaboom", False))

for key, should_be_present in expected_by_key:
    # Only the keys expected to be present are inserted.
    if should_be_present:
        bf.add(key)

for key, should_be_present in expected_by_key:
    assert bf.check(key) == should_be_present
示例#14
0
from BloomFilter import BloomFilter

if __name__ == '__main__':  # Self-check: no false negatives; report false positives
    import random
    import time

    N = 10000000
    BF = BloomFilter(N, 7)

    # 500000 random strings of 15 characters drawn from 'ACGT'.
    kmers = []
    for _ in range(500000):
        kmers.append(''.join(random.choice('ACGT') for _ in range(15)))

    # The last 1000 strings are held back from insertion and used as probes.
    kmers_to_add = kmers[:-1000]
    kmers_to_test = kmers[-1000:]

    for kmer in kmers_to_add:
        BF.add(kmer)

    # Every inserted string must be reported as a member.
    false_neg = sum(1 for kmer in kmers_to_add if kmer not in BF)
    assert false_neg == 0

    # Count held-back strings that the filter reports as members.
    false_pos = 0
    for kmer in kmers_to_test:
        if kmer in BF:
            false_pos += 1
    print(f'FP = {false_pos}')
示例#15
0
文件: Start.py 项目: hrcp/bloomfilter
    output_file=open(sys.argv[2],"a+")
    p=float(sys.argv[3])

except IOError:
    print "Invalid input arguments! Check that the input file exists!"
    exit(0)
except ValueError as e:
    print "Third argument not a float number!"
    exit(0)
lines=input_file.readlines()
bloom.initialize_using_np(len(lines),float(sys.argv[3]))

start_time=time.time()
for line in lines:
    line=line.strip().rstrip()
    bloom.add(line)

input_file.close()

print "Time needed to fill filter: ", time.time() - start_time,"s"
#print "Memory occupied by bloom filter ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, "bytes"
while(True):
    inputed=raw_input("Enter key to check if it is in filter: ")
    if(bloom.query(inputed)):
        print "Possible in filter"
        output_file.write("Key "+inputed+" possible in filter\n")
    else:
        print "Not in filter"
        output_file.write("Key "+inputed+" not in filter\n")
output_file.close()
示例#16
0
            try:
                #save(root_url + list[i].get("href"),i)
                url = list[i].get("href")
                if url.startswith("ftp://"):
                    print >> logsfile,url.encode('utf-8')
                    logsfile.flush()
                elif(url.find("http://www.dy2018.com") == -1):
                    url = root_url + list[i].get("href")
                if not bloom.__contains__(url):
                    #print(str(i)+"------num---"+str(num-1)+"---------"+url)
                    bloom.add(url)
                    foreach(url,num)
            except Exception,e:
                continue
                #print >> logsfile,("foreach error")
    except Exception,e:
        return
        #print >> logsfile,e.message

def save(url, title):
    """Print ``title`` (converted with ``str``) and ``url`` joined by a run of ``=``."""
    separator = "=================="
    line = str(title) + separator + url
    print(line)
if __name__ == '__main__':
    # Entry point: seed the filter with the start pages, then crawl from the
    # index page. The names bound here are module-level globals; the crawl
    # code visible above reads `bloom`, `root_url` and `logsfile`.
    maxnum = 4
    root_url = 'http://www.dy2018.com'
    bloom = BloomFilter(160000,1000)
    bloom.add('http://www.dy2018.com/index.html')
    bloom.add('http://www.dy2018.com/')
    # `with` closes the log file even if the crawl raises; the previous manual
    # open()/close() pair leaked the handle on any exception in between.
    with open('./urls.log', 'a+') as logsfile:
        foreach('http://www.dy2018.com/index.html',1)
示例#17
0
class DeeperBloom(object):
    '''
    Membership filter made of a chain of k models plus a backup bloom filter.

    Each model is trained on the items the previous model did not accept, and
    positives that no model accepts are stored in the bloom filter.

    fp_fractions is an array of size self.k + 1 containing the fraction of the
    fp_rate dedicated to each of k models, then bloom filter, in that order. If not passed,
    default to splitting false positives evenly.
    '''
    def __init__(self, models, data, fp_rate, fp_fractions=None):
        '''Fit every model on data, then build the backup bloom filter.'''
        self.models = models
        self.k = len(self.models)
        self.thresholds = [None] * self.k
        if fp_fractions is None:
            # Even split between the k models and the bloom filter.
            even_share = float(fp_rate) / (self.k + 1)
            self.fp_rate_bloom = even_share
            self.fp_rates = [even_share] * self.k
        else:
            self.fp_rate_bloom = fp_fractions[self.k] * fp_rate
            self.fp_rates = [fp_fractions[i] * fp_rate for i in range(self.k)]
        self.fit(data)
        self.create_bloom_filter(data)

    def check(self, item):
        '''Return True if any model scores item above its threshold, else ask the bloom filter.'''
        for model, threshold in zip(self.models, self.thresholds):
            if model.predict(item) > threshold:
                return True
        return self.bloom_filter.check(item)

    def create_bloom_filter(self, data):
        '''Store every positive that no model accepts in a new bloom filter.'''
        print("Creating bloom filter")
        preds = [model.predicts(data.positives) for model in self.models]
        # A positive is a false negative only if every model scores it at or
        # below that model's threshold.
        false_negatives = [
            positive
            for j, positive in enumerate(data.positives)
            if not any(preds[i][j] > self.thresholds[i] for i in range(self.k))
        ]
        print("Number of false negatives at bloom time", len(false_negatives))
        print("Effective bloom filter false positive rate", self.fp_rate_bloom)
        self.bloom_filter = BloomFilter(
            len(false_negatives),
            self.fp_rate_bloom,
            string_digest
        )
        for fn in false_negatives:
            self.bloom_filter.add(fn)
        print("Created bloom filter")

    @staticmethod
    def _threshold_index(sample_count, fp_rate):
        '''
        Return the index into ascending scores used as the threshold.

        The raw value ceil(sample_count * (1 - fp_rate)) equals sample_count
        whenever sample_count * fp_rate < 1, which is one past the last valid
        index. It is clamped to the last element; using the maximum score as
        threshold leaves no sample above it, which still respects the
        false-positive budget.
        '''
        raw_index = int(math.ceil(sample_count * (1 - fp_rate)))
        return min(raw_index, sample_count - 1)

    @staticmethod
    def _not_accepted(model, threshold, items):
        '''Return a new list of the items that model scores at or below threshold.'''
        preds = model.predicts(items)
        return [item for item, pred in zip(items, preds) if pred <= threshold]

    def fit(self, data):
        '''
        Train the models in order and choose one threshold per model.

        Model i > 0 only sees the positives and negatives that model i - 1 did
        not accept. Raises IndexError if the held-out negatives run out,
        because no threshold can be chosen then.
        '''
        ## Split negative data into subgroups.

        for i in range(self.k):
            # First prep s1, s2 and curr_positives
            print("Data prep", i)
            if i == 0:
                (s1, s2) = split_negatives(data)
                curr_positives = data.positives
                false_negatives = curr_positives
            else:
                # TODO BALANCE
                # TODO FIX curr_positives not carrying through all false negatives
                # TODO add back difficulty factor stuff?
                # DIFFICULTY_FACTOR = 1.2
                prev_model = self.models[i - 1]
                prev_threshold = self.thresholds[i - 1]

                # Get false negatives from curr_positives, with
                # respect to prev model
                false_negatives = self._not_accepted(
                    prev_model, prev_threshold, false_negatives)
                # Separate copy: curr_positives is shuffled in place below and
                # that must not reorder false_negatives.
                curr_positives = list(false_negatives)

                # Get true negatives from s1, with respect to prev
                # model
                s1 = self._not_accepted(prev_model, prev_threshold, s1)

                # Get true negatives from s2, with respect to prev
                # model
                s2 = self._not_accepted(prev_model, prev_threshold, s2)

                # Ensure that s1 is balanced relative to curr_positives
                # if (len(s1) > len(curr_positives)):
                #     s1 = s1[:len(curr_positives)]
            print("Number of false negatives at this step", len(false_negatives))
            print("Training model with train, dev, positives", i, len(s1), len(s2), len(curr_positives))
            random.shuffle(s1)
            random.shuffle(s2)
            random.shuffle(curr_positives)

            ## Shuffle together subset of negatives and positives.
            ## Then, train the model on this data.
            shuffled = shuffle_for_training(s1, curr_positives)
            self.models[i].fit(shuffled[0], shuffled[1])
            print("Done fitting")

            ## We want a threshold such that at most s2.size * fp_rates[i] elements
            ## are greater than threshold.
            print("Using effective false positive rate for model", i, self.fp_rates[i])
            fp_index = self._threshold_index(len(s2), self.fp_rates[i])
            predictions = self.models[i].predicts(s2)
            predictions.sort()
            self.thresholds[i] = predictions[fp_index]
            print("Threshold for model", i, self.thresholds[i])
示例#18
0
 def test_add(self):
     """Adding one key sets between one and three bits."""
     m, k = 128, 3
     bf = BloomFilter(m, k)
     bf.add("#")
     set_bit_count = bf.bits.count()
     self.assertIn(set_bit_count, range(1, 4))