예제 #1
0
    def test_functionality(self):
        """Check membership answers and the raw buffer of a small filter.

        Adds a fixed set of keys to a filter created with
        ``inbloom.Filter(20, 0.01)``, asserts that every added key is
        reported as contained and that three keys which were never added
        are not, then compares the upper-cased hex form of ``buffer()``
        with a known-good value.
        """
        bloom = inbloom.Filter(20, 0.01)
        present = [
            "foo",
            "bar",
            "foosdfsdfs",
            "fossdfsdfo",
            "foasdfasdfasdfasdfo",
            "foasdfasdfasdasdfasdfasdfasdfasdfo",
        ]
        absent = ["goo", "gar", "gaz"]

        for key in present:
            bloom.add(key)

        # Everything that was inserted must be found again.
        for key in present:
            assert bloom.contains(key)

        # None of the keys that were never inserted may be reported.
        for key in absent:
            assert not bloom.contains(key)

        # Pin the exact bit pattern produced by the insertions above.
        expected = '02000C0300C2246913049E040002002000017614002B0002'
        actual = hexlify(bloom.buffer()).upper()
        assert expected == actual
예제 #2
0
    def test_dump_load(self):
        """Round-trip a filter through ``dump``/``load`` and check load errors.

        The test:

        1. dumps a filter holding the single key ``'abc'`` and compares the
           hex form of the dump with a known-good value;
        2. loads that dump, dumps it again, and expects the same bytes;
        3. replaces the first two bytes of the dump with ``0xFF 0xFF`` and
           expects ``inbloom.load`` to raise ``inbloom.error`` matching
           "checksum mismatch";
        4. truncates the data to four bytes and expects ``inbloom.error``
           matching "incomplete payload".
        """
        bf = inbloom.Filter(20, 0.01)
        bf.add('abc')
        expected = '620d006400000014000000000020001000080000000000002000100008000400'
        actual = hexlify(inbloom.dump(bf))
        assert expected == actual

        # Loading a dump and dumping it again must reproduce the same bytes.
        bf = inbloom.load(inbloom.dump(bf))
        actual = hexlify(inbloom.dump(bf))
        assert expected == actual

        # Overwrite exactly the first two bytes with 0xFF 0xFF, keeping the
        # payload length unchanged. The earlier ``str([0xff, 0xff])`` built
        # the 10-character text "[255, 255]" instead of two bytes, and fails
        # outright on Python 3 where text and bytes cannot be concatenated.
        # The valid dump above starts with 0x620d, so 0xFFFF differs from it.
        # NOTE(review): the first two bytes are presumably the checksum
        # field -- confirm against the serialization format.
        data = inbloom.dump(bf)
        data = b'\xff\xff' + data[2:]

        with self.assertRaisesRegexp(inbloom.error, "checksum mismatch"):
            inbloom.load(data)

        # Four bytes are expected to be rejected as an incomplete payload.
        data = data[:4]
        with self.assertRaisesRegexp(inbloom.error, "incomplete payload"):
            inbloom.load(data)
예제 #3
0
def build_filters():
    """Build a bloom filter over the most frequent phrases of recent deals.

    The function:

    1. calls ``set_up_temp_db()``;
    2. walks ``load_recent_deals()``, and for every phrase of each deal that
       is at least ``MIN_PHRASE_LENGTH`` long, updates its count and links
       the deal to the phrase row;
    3. logs the total phrase count;
    4. creates an ``inbloom.Filter`` sized for
       ``UP_TO_K_MOST_FREQUENT_PHRASES`` entries with an error of 0.0001 and
       adds every phrase returned by
       ``load_up_to_k_phrases_with_frequencies``.

    A failure while processing one deal is logged with its traceback and
    does not stop the remaining deals from being processed.

    Returns:
        A ``(bloom_filter, None)`` tuple; the second element is always
        ``None``.
    """
    logging.info("set up our temp DB")
    set_up_temp_db()

    logging.info("Figuring out all of the phrases we have in our corpus")
    # Pre-initialised so the summary line below works when there are no deals.
    deals_count = 0
    for deals_count, deal in enumerate(load_recent_deals(), 1):
        try:
            for phrase in get_all_phrases_for(deal):
                if len(phrase) < MIN_PHRASE_LENGTH:
                    continue
                phrase_rowid = update_count_for(phrase)
                save_deal_to_phrase_link(deal.deal_id, phrase_rowid)
        except Exception as e:
            # Deliberate best effort: one bad deal must not abort the build.
            logging.exception(e)
        if deals_count % 50 == 0:
            logging.info("Processed %d deals so far", deals_count)
    logging.info("There were %d deals", deals_count)

    total_phrase_count = load_total_phrase_count()
    logging.info(
        "There were %d phrases - K ceiling is %d",
        total_phrase_count,
        UP_TO_K_MOST_FREQUENT_PHRASES,
    )

    logging.info("Building bloom filter")
    bloom_filter = inbloom.Filter(
        entries=UP_TO_K_MOST_FREQUENT_PHRASES,
        error=0.0001
    )

    for phrase, frequency in load_up_to_k_phrases_with_frequencies(UP_TO_K_MOST_FREQUENT_PHRASES):
        # Arguments are passed to logging unformatted so the message is only
        # built when DEBUG output is actually enabled.
        logging.debug("Loaded phrase: %s, which had frequency %d", phrase, frequency)
        bloom_filter.add(phrase)
    logging.info("Bloom filter and sketch built OK")

    return bloom_filter, None
예제 #4
0
 def __init__(self, bloom_capacity, error_rate):
     """Store the sizing parameters and create the backing filter.

     ``bloom_capacity`` is passed to ``inbloom.Filter`` as ``entries`` and
     ``error_rate`` as ``error``; both are also kept on the instance.
     """
     self.bloom_capacity = bloom_capacity
     self.error_rate = error_rate
     self.bf = inbloom.Filter(
         entries=bloom_capacity,
         error=error_rate,
     )
예제 #5
0
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# ******************************************************
# DESC    :
# AUTHOR  : Alex Stocks
# VERSION : 1.0
# LICENCE : Apache License 2.0
# EMAIL   : [email protected]
# MOD     : 2017-03-03 17:22
# FILE    : bloom.py
# ******************************************************

import inbloom
import base64
import requests

# Basic usage: fill a filter with user ids, serialize it to base64, load it
# back and query it.
uids = [
    'u0667601477730140020006032',
    'u0824011478254848030001156',
    'u1768771480923934030001199',
    'u2413521474991179020001113',
    'u3686801478240154030005408',
    'u3757851481174023020001166',
    'u3795351482317225020001244',
    'u4555881480912962010001166',
    'u4710551480917824010001244',
    'u4881041468996697020001184',
    'u4881041474530826010001095',
    'u4905391484641165010001141',
    'u5425051481012754020001255',
    'u6450591484584368010001166',
    'u6937721484632783020001218',
    'u7414411480402599030001267',
    'u7675311485144153020001211',
    'u9587291477909978030001277',
    'u9805521470914539020001149',
]
# Size the filter for exactly the number of ids that will be added.
bf = inbloom.Filter(entries=len(uids), error=0.001)
for uid in uids:
    bf.add(uid)

res = base64.b64encode(inbloom.dump(bf))
# Presumably the value of ``res`` from the author's run:
# yoID6AAAABMNJALf42lALcpcu2WH9sZcLPWh/g+ynjcSVaWfDxVuudTRAA==

# Single-argument print(...) calls produce the same output as the former
# Python 2 print statements and also parse on Python 3.
print(len(bf.buffer()))
# b64encode returns a byte string; decode it so only the text is printed.
print("%s %d" % (res.decode("ascii"), len(res)))

# Rebuild the filter from the serialized form and query it.
bf = inbloom.load(base64.b64decode(res))
print(bf.contains(uids[0]))
print(bf.contains('u0'))