Пример #1
0
def test_bloom_filter():
    """Check basic Bloom filter membership, then compare it with an RBF SVM.

    The first part adds two items to a small filter (20 bits, 3 hashes) and
    asserts membership before and after each insertion.

    The second part fits an RBF-kernel SVM on ``X``/``Y``, times its
    predictions and prints a confusion matrix, then does the same for a
    Bloom filter that holds the first feature of every positively labelled
    sample.

    Relies on names supplied by the enclosing module: ``bloom``, ``svm``,
    ``datetime``, ``confusion_matrix``, ``X``, ``Y`` and ``fpr_b``.
    """
    bf = bloom.BloomFilter(bits=20, hashes=3)
    bf.add('hello')
    assert bf.contains('hello'), 'BloomFilter failed to add item "hello"'
    assert not bf.contains('hi'), 'BloomFilter failed to deny item "hi"'
    bf.add('hi')
    assert bf.contains('hello'), 'BloomFilter failed to add item "hello"'
    assert bf.contains('hi'), 'BloomFilter failed to add item "hi"'

    # We create an instance of SVM and fit our data. We do not scale our
    # data since we want to plot the support vectors.
    C = 0.2  # SVM regularization parameter

    rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X, Y)
    print('RBF: ')
    start = datetime.datetime.now()
    y_pred = rbf_svc.predict(X)
    end = datetime.datetime.now()
    print('========== Learned Bloom filter result =============')
    print("Learned Bloom average predict time: ", (end - start))
    y_label = [int(i) for i in Y]
    conf_matrix = confusion_matrix(y_label, y_pred)
    print(conf_matrix)
    # print(classification_report(y_label, y_pred))

    print('========== Traditional Bloom filter result =========')
    # Use a distinct local name. Assigning to ``bloom`` here would make it a
    # local variable for the whole function, so the ``bloom.BloomFilter``
    # lookups (including the one at the top) would raise UnboundLocalError.
    traditional_bf = bloom.BloomFilter(len(X), fpr_b)
    for sample, label in zip(X, Y):
        # Only positively labelled samples are members of the filter.
        if label == 1:
            traditional_bf.add(sample[0])
    start = datetime.datetime.now()
    y_bloom = [traditional_bf.check(x[0]) for x in X]
    end = datetime.datetime.now()
    print(traditional_bf.size)
    print("Traditional Bloom average predict time: ", (end - start))
    print(confusion_matrix(y_label, y_bloom))
Пример #3
0
#!/usr/bin/env python
# Author Dario Clavijo 2017
# GPlv3

# Used for checking "Have I Been Pwned" passwords against a Bloom filter.

import bloom
import sys
import hashlib

#bf = bloom.BloomFilter(array_size=(1024**3)*8,do_hashing=True)
# Build the filter from the file named by the first command-line argument.
bf = bloom.BloomFilter(filename=sys.argv[1],
                       array_size=(1024**2) * 512,
                       do_hashing=False,
                       slice_bits=120,
                       slices=7,
                       ishex=True)

# hashlib only accepts bytes. On Python 3 argv entries are text, so encode
# them first; on Python 2 they are already byte strings and pass through
# unchanged. 'surrogateescape' restores undecodable argv bytes verbatim.
password = sys.argv[2]
if not isinstance(password, bytes):
    password = password.encode('utf-8', 'surrogateescape')

# Function-call form of print works on both Python 2 and Python 3.
print(bf.check(hashlib.sha1(password).hexdigest()))
Пример #4
0
#!/usr/bin/env python
# Author Dario Clavijo 2017
# GPlv3

import bloom
import sys
import fileinput

# The first argument scales the filter: array_size is SIZEMB * 1024**2.
SIZEMB = int(sys.argv[1])
bf = bloom.BloomFilter(array_size=(1024**2) * SIZEMB,
                       do_hashing=False,
                       slice_bits=120,
                       slices=7,
                       ishex=True)

new = 0
seen = 0
# The file is only read, so open it read-only, and let the context manager
# close it even if bf.update() raises part-way through.
with open(sys.argv[2], 'r') as fp:
    for line in fp:
        #h=str(int(line.rstrip(),16)).encode('utf8')
        h = line.rstrip()
        #print(h)
        # Keep the explicit comparison with False: the return type of
        # update() is not visible here, and `not x` would also count a
        # None return as "new".
        if bf.update(h) == False:  # noqa: E712
            new += 1
        else:
            seen += 1
        # Running totals are printed after every input line.
        print("new:%d seen:%d" % (new, seen))
Пример #5
0
import json
import random
import ssl

import jsonpath
import pymysql.cursors
import redis

import bloom

# ===== Connect to the database
conn = pymysql.connect(host="127.0.0.1",
                       user="******",
                       passwd="root",
                       db="zhihu",
                       charset='utf8',
                       use_unicode=True)
cursor = conn.cursor()

bf = bloom.BloomFilter(0.001, 100000000)

# NOTE(security): this swaps the default HTTPS context factory for the
# unverified one, so certificate verification is turned off for any code
# that relies on the default context. Confirm this is intended.
ssl._create_default_https_context = ssl._create_unverified_context

# ===== Set header information
# Each entry is used directly as the value of the "User-Agent" header, so it
# must not carry a "User-Agent:" prefix of its own. Every entry also needs a
# trailing comma: without one, adjacent string literals are concatenated
# into a single bogus entry.
ua = [
    'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;TheWorld)',
    'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
]
thisua = random.choice(ua)
headers = {"User-Agent": thisua}
headers1 = {
    'Cache-Control': 'max-age=0',
    'User-Agent': random.choice(ua),
Пример #6
0
import sys
import bloom

# Output path for the saved filter.
filename = sys.argv[1]

try:
    Gigs = int(sys.argv[2])
except (IndexError, ValueError):
    # A missing or non-numeric size argument gets the same usage message.
    # Function-call form of print works on both Python 2 and Python 3.
    print("Plase input the correct number of Gigabytes of RAM to be used.")
    # sys.exit is always available; the bare exit() helper is only injected
    # by the site module.
    sys.exit(1)

# A non-positive size creates nothing and writes no file.
if Gigs > 0:
    bf = bloom.BloomFilter(array_size=Gigs * (1024**3),
                           do_bkp=False,
                           do_hashing=False,
                           fast=False)
    bf.save(filename)
Пример #7
0
#!/usr/bin/env python
# Author Dario Clavijo 2017
# GPlv3

import bloom
import sys

bf = bloom.BloomFilter()

# Add every line of the input file (trailing whitespace stripped) to the
# filter. The context manager closes the file even if bf.add() raises.
with open(sys.argv[1], 'r') as fp:
    for line in fp:
        bf.add(line.rstrip())

# Write the populated filter to the path given as the second argument.
bf.save(sys.argv[2])
Пример #8
0
import sys
import bloom

# Optional second argument: the array size. Fall back to 5 * 1024**3 when it
# is missing or not an integer. Only those two failures are caught, so
# unrelated errors and Ctrl-C are no longer swallowed.
try:
    array_size = int(sys.argv[2])
except (IndexError, ValueError):
    array_size = (1024**3) * 5

bf = bloom.BloomFilter(array_size=array_size,
                       do_bkp=False,
                       do_hashing=False,
                       bitshuffle=False)
# The target path is set on the instance, then save() is called without
# arguments.
bf.filename = sys.argv[1]
bf.save()
Пример #9
0
        with open(sample, 'r') as sample_fh:
            content = sample_fh.read()

        # Get text from HTML content
        words = html.fromstring(content).text_content().replace("\n", "")
        words = re.findall(r"[\w]+", words)
        # Remove all punctuation etc., convert words to lower and delete
        # duplicates
        words = list(set([word.lower() for word in words]))

        # Remove common words
        words = remove_common_words(words)
        # Stemming to reduce the number of words
        words = list(set([p.stem(word, 0, len(word)-1) for word in words]))

        tmp_filter = bloom.BloomFilter(capacity=len(words),
                                       error_rate=error_rate)
        for word in words:
            tmp_filter.add(word)

        filters.append(tmp_filter.buckets)

        pages.append({"title": re.search(r"@title=(.*)\n", content).group(1),
                      "url": sample[3:]})

    # First Int32 is length
    filters_to_write = struct.pack("<i", len(filters))
    # Then comes the length of each filter
    for i in filters:
        filters_to_write += struct.pack("<i", len(i))
    # Finally comes the filters themselves
    for i in filters: