Example No. 1
class SpellChecker:
    def __init__(self):
        self.dictionary = BloomFilter()
        with open('/usr/share/dict/words', 'r') as word_list:
            for word in word_list:
                self.dictionary.add(word.strip())

    def valid(self, string):
        return self.dictionary.includes(string)
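A minimal usage sketch, assuming a BloomFilter implementation that provides the add and includes methods used above:

checker = SpellChecker()
for candidate in ('hello', 'helo'):
    # A Bloom filter hit may be a false positive, but a miss is definitive.
    print(candidate, checker.valid(candidate))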
Example No. 2
def testDumpGzippedAndLoadBloomFilter(self):
    bloom_filter = BloomFilter(self.BLOOM_CAPACITY, self.BLOOM_ERROR_RATE)
    for key in self.all_keys:
        bloom_filter.add(key)

    dump_bytes = bloom_filter.dump(gzipped=True)
    bloom_filter2 = BloomFilter.load(dump_bytes)

    self.assertEqual(bloom_filter, bloom_filter2)
    self.check_contains(bloom_filter2)
Example No. 3
def testDumpAndLoadBase64BloomFilter(self):
    bloom_filter = BloomFilter(self.BLOOM_CAPACITY, self.BLOOM_ERROR_RATE)
    for key in self.all_keys:
        bloom_filter.add(key)

    dump_str = bloom_filter.dump_to_base64_str(gzipped=True)
    bloom_filter2 = BloomFilter.load_from_base64_str(dump_str)

    self.assertEqual(bloom_filter, bloom_filter2)
    self.check_contains(bloom_filter2)
Example No. 4
class EventValidator:

    def __init__(self):
        # Instance attributes, so separate validators do not share state.
        self.last = -1
        self.current = -1
        self.bloom = BloomFilter(4980000, 0.01)
        self.orderErrors = []
        self.uniqueErrors = []

    def checkOrder(self, value):

        if self.last == -1:
            self.last = value
        else:
            self.current = value
            if self.last > self.current:
                self.orderErrors.append(self.current)
            else:
                self.last = self.current

    def checkUnique(self, value):

        inside = value in self.bloom

        if inside:
            self.uniqueErrors.append(value)
        else:
            self.bloom.add(value)
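A short usage sketch, assuming the BloomFilter(capacity, error_rate) constructor used above; the event values are invented for illustration:

validator = EventValidator()
for event in (1, 2, 2, 5, 4):
    validator.checkOrder(event)
    validator.checkUnique(event)
print(validator.orderErrors)   # [4]: arrived after the larger value 5
print(validator.uniqueErrors)  # [2]: seen twice, modulo false positives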
Example No. 5
def __init__(self, file_name):
    self.bf = BloomFilter(10000000, 8)
    with open(file_name, "r") as input_file:
        for file_line in input_file:
            file_line = file_line.rstrip()
            self.bf.add(file_line)
Example No. 6
def build_entity_bloom(p="../../data/buboqa/indexes/names_2M.pkl",
                       outp="../../data/buboqa/data/names_2M.labels.bloom"):
    tt = q.ticktock("bloom-entities")
    tt.tick("loading names pkl")
    entnames = pkl.load(open(p, "rb"))
    tt.tock("loaded names pkl")
    tt.tick("building bloom filter")
    fset = BloomFilter(2e6, 1e-3)
    for uri, names in tqdm(entnames.items()):
        for name in names:
            fset.add(name)
    tt.tock("built")
    tt.tick("saving bloom filter")
    with open(outp, "wb") as f:
        pkl.dump(fset, f)
    tt.tock("saved")
    return fset
Example No. 7
class AutoResetBloomFilter:
    RESET_TIME = 60 * 60 * 10  # NOTE: reset every 10 hours

    def __init__(self):
        self.bf = BloomFilter()
        self.last_reset_time = int(time.time())

    def add(self, v):
        now = int(time.time())
        if now - self.last_reset_time > self.RESET_TIME:
            logging.info("bloom filter reset")
            self.bf = BloomFilter()
            self.last_reset_time = now
        self.bf.add(v)

    def __contains__(self, key):
        return key in self.bf
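A usage sketch, assuming time and logging are imported and that BloomFilter() is constructible with no arguments, as in the snippet:

seen = AutoResetBloomFilter()
seen.add('item-1')
print('item-1' in seen)  # True while the filter has not yet been reset
print('item-2' in seen)  # False, barring a false positive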
Example No. 8
def __init__(self, base_url: str, cfg: configparser.SectionProxy) -> None:
    self.base_url = base_url
    self.config = cfg
    split_url = urlsplit(self.base_url)
    self.closure_url = '{scheme}://{netloc}'.format(
        scheme=split_url.scheme, netloc=split_url.netloc)
    self.pool = ThreadPoolExecutor(
        max_workers=int(self.config['MAX_WORKER']))
    self.task_queue = Queue(maxsize=3 * int(self.config['MAX_WORKER']))
    self.task_queue.put(self.base_url)
    self.crawled_pages = BloomFilter(
        max_elements=int(self.config['MAX_ELEMENTS']),
        error_rate=float(self.config['ERROR_RATE']))
    self.crawled_pages.add(self.base_url)
    self.total = 1
    self.lock = Lock()
    self.run_time = time.time()
Example No. 9
def process_two(pipe21, pipe23):
    pid = 1
    counter = BloomFilter(n, p, 0)
    counter = recv_message(pipe21, pid, counter, 'g')
    counter = send_message(pipe21, pid, counter, 'h')
    counter = send_message(pipe23, pid, counter, 'i')
    counter = recv_message(pipe23, pid, counter, 'j')
    print_history(counter, pid)
Example No. 10
    def __init__(self,
                 event_bus: EndpointAPI,
                 peer_pool: ETHProxyPeerPool,
                 tx_validation_fn: Callable[[BaseTransactionFields], bool],
                 token: CancelToken = None) -> None:
        super().__init__(token)
        self._event_bus = event_bus
        self._peer_pool = peer_pool

        if tx_validation_fn is None:
            raise ValueError('Must pass a tx validation function')

        self.tx_validation_fn = tx_validation_fn
        # 1M elements should give us ~9000 blocks before the filter becomes
        # less reliable; it should take up about 1 MB of memory.
        self._bloom = BloomFilter(max_elements=1000000)
        self._bloom_salt = str(uuid.uuid4())
Example No. 11
    async def filter(self, existed_vid_list=None):
        # Construct a Bloom filter and seed it with the already-known ids.
        bloom = BloomFilter(max_elements=config.MAX_ESTIMATE_RECORD_NUMBER)
        for ele in existed_vid_list or []:
            bloom.add(ele)
        latest_results = []  # final result to output
        # The paging on xinpianchang is unreliable, so keep a buffer of
        # already-seen entries before concluding we have reached the end.
        buffer = config.check_latest_buffer
        latest = await self.fetch()
        for ele in latest:
            if ele['vid'] in bloom:
                # The element is already recorded, which suggests the
                # upcoming elements are repeats; once the buffer is
                # exhausted, return the current latest_results.
                if buffer == 0:
                    del bloom  # release memory
                    return jmespath.search('[]', latest_results) if latest_results else []
                buffer -= 1
            else:
                bloom.add(ele['vid'])  # record this id in the filter
                latest_results.append(ele)
        return jmespath.search('[]', latest_results) if latest_results else []
Example No. 12
class DoubleBloom:
    def __init__(self, values, probability):
        self.zero_bloom = BloomFilter(values, probability, False)
        self.one_bloom = BloomFilter(values, probability, False)
        self.next_level = None

    def insert(self, key, value):
        if value == '0':
            self.zero_bloom.insert(key)
        else:
            self.one_bloom.insert(key)

    def get_value(self, key):
        if self.zero_bloom.contains(key):
            if self.one_bloom.contains(key):
                if self.next_level is not None:
                    return self.next_level.get_value(key)
                return 'Both'
            return '0'
        elif self.one_bloom.contains(key):
            return '1'
        return None

    def add_level(self, values, probability):
        self.next_level = DoubleBloom(values, probability)
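A quick sketch of the lookup semantics, assuming a BloomFilter(values, probability, ...) with the insert/contains API used above; the keys are invented:

db = DoubleBloom(1000, 0.01)
db.insert('key-a', '0')
db.insert('key-b', '1')
print(db.get_value('key-a'))  # '0'
print(db.get_value('key-b'))  # '1'
print(db.get_value('key-c'))  # None, unless both filters misfire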
Example No. 13
def do_add_strings(self, string_lists):
    try:
        for key in string_lists:
            document = {}
            bf = BloomFilter(max_elements=2**16, error_rate=0.01)
            for element in string_lists.get(key):
                bf.add(element)
            document[key] = cPickle.dumps(bf)
            sig = key.split('_')
            for col_type in self.collection:
                if col_type == sig[-1]:
                    size = sys.getsizeof(document[key])
                    print("async save bloom tree into MongoDB: %s \t size is %f M"
                          % (key, size / 1024 / 1024))
                    break
    except Exception as e:
        print(e)
Example No. 14
def check():
    bloom_filter = BloomFilter(max_elements=2000000, error_rate=0.0001)
    with open('urls.jsonl', mode='w') as f:
        for file_name in os.listdir("url_outlinks"):
            with open(f"url_outlinks/{file_name}", mode='r') as fp:
                data = json.load(fp)

                for domain, obj in data.items():
                    for url, outlinks in obj.items():
                        if url not in bloom_filter:
                            f.write(
                                f"""{json.dumps({"url": url, "no_outlinks": len(outlinks), "outlinks": outlinks})}\n"""
                            )
                            bloom_filter.add(url)
            f.flush()
Example No. 15
class BLOOMDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""
    def __init__(self, path=None):
        self.file = None
        self.fingerprints = BloomFilter(10000, 0.00001)

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)

    def close(self, reason):
        self.fingerprints = None
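To plug a dupefilter like this into Scrapy, point the DUPEFILTER_CLASS setting at it; the module path below is hypothetical:

# settings.py
DUPEFILTER_CLASS = 'myproject.dupefilters.BLOOMDupeFilter'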
Example No. 16
def process_one(pipe12):
    pid = 0
    counter = BloomFilter(n, p, 0)
    counter = event(pid, counter, 'b')
    counter = send_message(pipe12, pid, counter, 'c')
    counter = event(pid, counter, 'd')
    counter = recv_message(pipe12, pid, counter, 'e')
    counter = event(pid, counter, 'f')
    print_history(counter, pid)
Example No. 17
def add(self, key):
    bfilter = self.sbfilters[-1]
    if not bfilter.can_accomodate():
        new_expected_inserts = bfilter.expected_inserts * self.space_scale
        new_error_rate = bfilter.error_rate * self.error_prob_ratio
        new_bfilter = BloomFilter(new_expected_inserts, new_error_rate)
        self.sbfilters.append(new_bfilter)
        bfilter = new_bfilter
    bfilter.add(key)
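The method above assumes an enclosing object that keeps a list of filters plus two growth parameters; a minimal sketch of that state, with attribute names taken from the snippet and default values invented for illustration:

class ScalableBloomFilter:
    def __init__(self, expected_inserts=1000, error_rate=0.001):
        self.space_scale = 2         # each new filter holds twice as many keys
        self.error_prob_ratio = 0.5  # ...at half the per-filter error rate
        self.sbfilters = [BloomFilter(expected_inserts, error_rate)]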
Example No. 18
    def put_all(self, kvps):
        timestamp = self.next_timestamp()

        txn_keys = None
        if self.algorithm == RAMPAlgorithm.Fast:
            txn_keys = kvps.keys()

        bloom_filter = None
        if self.algorithm == RAMPAlgorithm.Hybrid:
            bloom_filter = BloomFilter(BLOOM_FILTER_SIZE, BLOOM_FILTER_HASHES)
            bloom_filter.list_to_bloom(kvps.keys())

        for key in kvps:
            self.key_to_partition(key).prepare(
                key, DataItem(kvps[key], timestamp, txn_keys, bloom_filter),
                timestamp)

        for key in kvps:
            self.key_to_partition(key).commit(key, timestamp)
Example No. 19
def __init__(self, start_urls: List[str], crawled_pages_count: int,
             chunk_size: int, fetch_workers: int, database_workers: int):
    # Using a set at this scale would hit the memory limit, so we use
    # a Bloom filter instead, trading away some lookup speed.
    self._visited = BloomFilter(max_elements=crawled_pages_count)
    self._logger = get_logger(__name__)
    self._stop_crawling = False
    self._urls = start_urls
    self._data = []
    self._buffer = []
    self._total_crawled_pages = 0
    self._fetch_error_rate = 0.9
    self._crawled_pages_count = crawled_pages_count
    self._chunk_size = chunk_size
    self._fetch_workers = fetch_workers
    self._database_workers = database_workers
    self._max_buffer_len = self._chunk_size * self._fetch_error_rate
Example No. 20
def __init__(self, env, feature_transformer):
    self.env = env
    self.models = {}
    self.models_elite = {}
    self.feature_transformer = feature_transformer
    for a in env.actions_available:
        self.models[a] = PassiveAggressiveRegressor(
            C=1.0,
            fit_intercept=True,
            shuffle=False,
            loss='epsilon_insensitive',
            epsilon=0.1)
        self.models_elite[a] = PassiveAggressiveRegressor(
            C=1.0,
            fit_intercept=True,
            shuffle=False,
            loss='epsilon_insensitive',
            epsilon=0.1)
    self.bloom_states = BloomFilter(max_elements=256**2)
Example No. 21
    def __init__(self, settings, stats):
        self.never_cache = set(
            urlparse(url).path for url in settings.get("NEVER_CACHE", []))
        self.never_cache.add(urlparse("/robots.txt").path)

        logger.info(
            f"Initiating bloom filter.... Never Cache paths: {sorted(self.never_cache)}"
        )

        self.visited = BloomFilter(
            max_elements=settings.getint("VISITED_FILTER_MAX_REQUESTS",
                                         4000000),
            error_rate=settings.getfloat("VISITED_FILTER_ERROR_RATE", 1e-9),
            filename=settings.get("VISITED_FILTER_PATH"),
        )
        self.stats = stats
        logger.info(
            f"Loaded visited urls bloomfilter. Size {self.visited.num_bits_m / (1024 ** 2 * 8)} MiB."
        )
Example No. 22
    def add_test(self):
        try:
            for i in range(1000):
                # Pick a random number for string size.
                string_size = random.randrange(0, 10000)
                # Create random alphanumeric strings.
                random_string = ''.join(
                    random.choices(string.ascii_uppercase + string.digits,
                                   k=string_size))

                # The size of the filter is not important in this test.
                bloom = BloomFilter(bit_vector_size=10)
                # Add the random string to the filter.
                bloom.add(random_string)

            logging.info("add_test: PASS")

        except ValueError:
            logging.error("add_test: FAIL. A random string caused an error.")
Example No. 23
def bloom_filter(fp, pattern):
    ret = f'Query {pattern} pattern in data set\n'
    for path in fp:
        stock_name = os.path.basename(path)[:-4]
        mvg, date = moving_average(path, 10)
        bmvg = trans2_01(mvg)
        bloom = BloomFilter(max_elements=10000)
        length = len(pattern)
        # Seed the first window of len(pattern) symbols, then slide it
        # one position at a time, adding every window to the filter.
        ele = ''
        for i in range(length):
            ele = ele + str(bmvg[i])
        for i in range(length - 1, len(bmvg)):
            if i != length - 1:
                ele = ele + str(bmvg[i])
                ele = ele[1:]
            bloom.add(ele)
        if pattern in bloom:
            ret = ret + f'Find {pattern} pattern in {stock_name}\n'
    return ret
Example No. 24
    def __init__(self, server, key, debug=False):
        """Initialize the duplicates filter.

        Parameters
        ----------
        server : redis.StrictRedis
            The redis server instance.
        key : str
            Redis key under which fingerprints are stored.
        debug : bool, optional
            Whether to log filtered requests.

        """
        self.server = server
        self.key = key
        self.debug = debug
        self.logdupes = True

        self.bloomfilter = BloomFilter(redis_conn=server, key=key)
Example No. 25
def __init__(self, *args, **kwargs):
    # Root of the site to crawl.
    self.base_url = 'http://wzggzy.wz.gov.cn/'
    # super(QhSpider, self).__init__(*args, **kwargs)
    self.bloom_filter = BloomFilter(max_elements=1000000,
                                    error_rate=0.1,
                                    filename='bf.data')
    self.num = 0
    self.scrawl_mode = ScrawlMode.HISTORY
    self._stop_parse = False
Example No. 26
class StatisticsHolder:
    MAX_DELAY = 5

    def __init__(self, start_ts, end_ts):
        self.counter = 0
        self.start_ts = start_ts
        self.end_ts = end_ts
        self.filter = BloomFilter(max_elements=50000, error_rate=0.01)

    def add(self, uid):
        if uid not in self.filter:
            self.counter += 1
            self.filter.add(uid)

    def __contains__(self, ts):
        return self.start_ts <= ts < self.end_ts

    def closed_by(self, ts):
        return ts >= self.end_ts + StatisticsHolder.MAX_DELAY
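A usage sketch for counting unique uids inside one time window, assuming the BloomFilter(max_elements, error_rate) constructor used above:

holder = StatisticsHolder(start_ts=0, end_ts=60)
for ts, uid in [(1, 'a'), (2, 'b'), (3, 'a')]:
    if ts in holder:  # __contains__ tests the timestamp window
        holder.add(uid)
print(holder.counter)        # 2 distinct uids, modulo false positives
print(holder.closed_by(66))  # True: 66 >= end_ts + MAX_DELAY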
Example No. 27
class Node:
    def __init__(self, k, expected_num, fp_prob):
        """
        Represents a single node of Bloom Tree

        """
        self.children: List[Node] = []
        self.parent: Optional[Node] = None
        self.filter = BloomFilter(expected_num, fp_prob)

        self.dataset_id: Optional[str] = None
        self.k = k

    def populate_dataset_info(self, dataset: List[Read]) -> None:
        self.dataset_id = dataset[0].filename
        self.insert_kmers_from_dataset(dataset)

    def insert_kmers_from_dataset(self, dataset: List[Read]) -> None:
        for read in dataset:
            for kmer in read.kmers(self.k):
                self.filter.insert(kmer)

    def add_node_kmers(self, other: 'Node') -> None:
        self.filter.filter |= other.filter.filter

    def num_children(self) -> int:
        return len(self.children)

    def score(self, other: 'Node') -> int:
        """
        "Hamming distance" score where lower is better
        :param other: The node to compare against
        """
        return count_xor(self.filter.filter, other.filter.filter)

    def get_size(self):
        """
        Returns the total number of bytes occupied by the filter object
        """
        return (sys.getsizeof(self.children) + sys.getsizeof(self.parent) +
                sys.getsizeof(self.dataset_id) + sys.getsizeof(self.k) +
                self.filter.get_size())
Example No. 28
def false_positive_rate_benchmark(error_rate):
    trial_arr = []
    fps_rate = []
    capacity = 10000
    for trial in range(10000, 100000, 1000):
        # Use a fresh filter per trial so earlier trials do not inflate
        # the measured false-positive rate of later ones.
        bfilter = BloomFilter(capacity, error_rate)
        fp = 0
        for i in range(trial):
            bfilter.add(i)
        for i in range(trial, 2 * trial + 1):
            if i in bfilter:
                fp += 1
        fp_rate = fp / float(trial)
        fps_rate.append(fp_rate)
        trial_arr.append(trial)

    plt.plot(trial_arr, fps_rate, 'bs')
    plt.ylabel('False Positive Rate')
    plt.xlabel('No. of Trials')
    plt.show()
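For comparison with the measured curve, the textbook estimate of the false-positive rate for a filter with m bits, k hash functions, and n inserted keys is (1 - e^(-kn/m))^k; a small helper, assuming m and k can be read off the filter:

import math

def theoretical_fp_rate(m, k, n):
    # After n insertions, a given bit is still 0 with probability
    # (1 - 1/m)**(k*n), which is approximately e**(-k*n/m).
    return (1 - math.exp(-k * n / m)) ** k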
Example No. 29
def __init__(self):
    self.headers = {
        'User-Agent':
        'Dalvik/2.1.0 (Linux; U; Android 8.0.0; MIX 2 MIUI/V10.2.2.0.ODECNXM) NewsArticle/8.4.4'
    }
    self.cookies = {
        'cookies':
        'install_id=6672646082571388678; ttreq=1$a9ed7f4ce8fc84fced473d6e25c22226f381c13d; odin_tt=3e76568447d177856560d524c6ef5400407a437cfdd62767a36fb3b2decdeb01d43b9a7978232dc05c57af3c81bd10c277e78619093795e8392c1302c9aa8a75; sid_guard=c8f84a23bcce86b376964aeb42991709%7C1554173959%7C5184000%7CSat%2C+01-Jun-2019+02%3A59%3A19+GMT; uid_tt=2ad7176029f7302e11b7924e6e6566b7120075732cedcd39bc999fa5cbcf07a1; sid_tt=c8f84a23bcce86b376964aeb42991709; sessionid=c8f84a23bcce86b376964aeb42991709'
    }
    self.headers_details = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    self.cookies_details = {
        'Cookie':
        'tt_webid=6683297640216282629; __tea_sdk__user_unique_id=6683297640216282629; __tea_sdk__ssid=40d2e59e-696c-4a93-ace8-e1479b10aeef; csrf-token=61575f8b568b577d9d06c777d103ae53e6c10723; csrf-secret=6qDUsFL6WZ1aG2soaPw7PpmCtnxCv7fw'
    }
    self.post_video_url = 'http://127.0.0.1:30008/crawler/video/transfer'
    self.filter_url = 'http://console.cc.clipclaps.tv/crawler/log'
    self.have_met = BloomFilter(max_elements=100000, error_rate=0.1)
Example No. 30
class WordLookup:
    def __init__(self, file_name):
        self.bf = BloomFilter(10000000, 8)
        with open(file_name, "r") as input_file:
            for file_line in input_file:
                file_line = file_line.rstrip()
                self.bf.add(file_line)

    def is_qualified(self, string):
        str_len = len(string)
        if str_len != 6:
            return False
        for i in range(1, str_len - 1):
            first = string[:i]
            second = string[i:]
            if self.bf.lookup(first) and self.bf.lookup(second):
                # print(first + '+' + second + '=>' + string)
                return True
        return False
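A usage sketch; is_qualified asks whether a six-letter string splits into two dictionary words, assuming the add/lookup API above (the word-list path is hypothetical):

lookup = WordLookup('/usr/share/dict/words')
print(lookup.is_qualified('sunlit'))  # True if 'sun' and 'lit' are both listed
print(lookup.is_qualified('zzzzzz'))  # False, barring false positives on both halves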
Example No. 31
def process_three(pipe32, return_dict):
    pid = 2

    counter = BloomFilter(n, p, 0)
    print('\nInitial Bit array: 3')
    print(counter.bit_array)
    print()
    counter = recv_message(pipe32, pid, counter, 'k')
    counter = send_message(pipe32, pid, counter, 'l')
    print_history(counter, pid)
    for i in counter.history.keys():
        return_dict[i] = counter.history[i]
Example No. 32
def test_bloom_filter():
    bloomfilter = BloomFilter(NUM_KEYS, FALSE_POSITIVE_PROBABILITY) 
    word_present = ['abound','abounds','abundance','abundant','accessable', 
                    'bloom','blossom','bolster','bonny','bonus','bonuses', 
                    'coherent','cohesive','colorful','comely','comfort', 
                    'gems','generosity','generous','generously','genial'] 
    
    word_absent = ['facebook','twitter'] 
    
    for item in word_present: 
        bloomfilter.add(item) 
    
    test_words = word_present[:10] + word_absent 
    shuffle(test_words) 
    for word in test_words: 
        if bloomfilter.is_member(word): 
            if word in word_absent: 
                print(f"'{word}' is a false positive!") 
            else: 
                print(f"'{word}' is probably present!")
        else: 
            print(f"'{word}' is definitely not present!") 
Example No. 33
    def put_all(self, kvps):
        timestamp = self.next_timestamp()

        txn_keys = None
        if self.algorithm == RAMPAlgorithm.Fast:
            txn_keys = kvps.keys()

        bloom_filter = None
        if self.algorithm == RAMPAlgorithm.Hybrid:
            bloom_filter = BloomFilter(BLOOM_FILTER_SIZE, BLOOM_FILTER_HASHES)
            bloom_filter.list_to_bloom(kvps.keys())
            
        for key in kvps:
            self.key_to_partition(key).prepare(key,
                                               DataItem(kvps[key],
                                                        timestamp,
                                                        txn_keys,
                                                        bloom_filter),
                                               timestamp)

        for key in kvps:
            self.key_to_partition(key).commit(key, timestamp)
Example No. 34
def generate_bf_list(cols):
    block_cnt = 20
    block_len = 30
    n = block_cnt * block_len  # code space. set it to the max size of a col for now
    p = 0.01  # false positive probability

    # build bloom filter for all cols
    bloom_filter_list = []
    for col in cols:
        bloom_filter = BloomFilter(n, p)
        for num in col:
            bloom_filter.add(chr(num))
        bloom_filter_list.append(bloom_filter)
    return bloom_filter_list
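A quick usage sketch, assuming the BloomFilter(n, p) constructor from the snippet; the columns are invented:

cols = [[65, 66, 67], [67, 68]]
bf_list = generate_bf_list(cols)
# Membership syntax depends on the implementation, e.g.
# chr(65) in bf_list[0] if __contains__ is provided.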
Example No. 35
def process_two(pipe21, pipe23, return_dict):
    pid = 1

    counter = BloomFilter(n, p, 0)
    print('\nInitial Bit array: 2')
    print(counter.bit_array)
    print()
    counter = recv_message(pipe21, pid, counter, 'g')
    counter = send_message(pipe21, pid, counter, 'h')
    counter = send_message(pipe23, pid, counter, 'i')
    counter = recv_message(pipe23, pid, counter, 'j')
    print_history(counter, pid)
    for i in counter.history.keys():
        return_dict[i] = counter.history[i]
Example No. 36
def do_add(self, string_lists):
    """
    :param string_lists: eg. {lib1_fn: string_list, lib1_1g: string_list, lib1_2g: string_list, ..., }
    :return:
    """
    try:
        for key in string_lists:
            document = {}
            bf = BloomFilter(max_elements=2**16, error_rate=0.01)
            for element in string_lists.get(key):
                bf.add(element)
            document[key] = cPickle.dumps(bf)
            sig = key.split('_')
            for col_type in self.collection:
                if col_type == sig[-1]:
                    self.collection[col_type].save(document)
                    size = sys.getsizeof(document[key])
                    print("save bloom tree into MongoDB: %s \t size is %f M"
                          % (key, size / 1024 / 1024))
                    break
    except Exception as e:
        print(e)
Example No. 37
def main():
    initial_page = "http://yue.ifeng.com"

    url_queue = queue.Queue()
    url_filter = BloomFilter()
    url_filter.add(initial_page)
    url_queue.put(initial_page)

    while True:
        urls = []
        current_url = url_queue.get()  # take the first element off the queue
        try:
            store(current_url)
            urls = extract_urls(current_url)  # extract the links on the page
        except Exception as e:
            print("Error extract_urls")
            print(e)
        for next_url in urls:
            if url_filter.notcontains(next_url):
                url_filter.add(next_url)
                url_queue.put(next_url)
Example No. 38
# Count the number of words in the input file.
count = 0
with open("word_list.txt") as f:
    for line in f:
        count += 1

num_of_items = count
print("Number of Items: " + str(num_of_items))
p = 0.005
print("Probability of false positive error " + str(p))

bit_size = bit_array_size(num_of_items, p)
print("Bit Size: " + str(bit_size))

hash_size = size_of_hash(num_of_items, bit_size)
print("Hash Size: " + str(hash_size))

bf = BloomFilter(num_of_items, hash_size)
with open("word_list.txt") as f:
    word_list = f.read().splitlines()

for word in word_list:
    bf.add(word)

print(bf.lookup("99"))

print(bf.lookup("donkey"))
print(bf.lookup("oitqv"))
print(bf.lookup("fart"))
print(bf.lookup("Max"))
print(bf.lookup("Dichha"))
print(bf.lookup("Khuwalung"))
Example No. 39
import sys
from trie import Trie
from bloom_filter import BloomFilter
from tools import get_size

if __name__ == '__main__':
    bf_dups = 0
    tr = Trie()
    bf = BloomFilter(capacity=700000, error_rate=0.001)
    with open("words.txt") as file:
        for line in file:
            tr.put(line.strip())
            if bf.put(line.strip()):
                print("Duplicate in bloom filter: {0}".format(line.strip()))
                bf_dups += 1

    print("Trie. number of objects put: {0}".format(len(tr)))
    print("Bloom filter. number of objects put: {0}".format(len(bf)))
    print()
    print("Trie. Size of the object: {0}".format(sys.getsizeof(tr)))
    print("Bloom filter. Size of the object: {0}".format(sys.getsizeof(bf)))
    print()
    print("Trie. Size of the object(full): {0}".format(get_size(tr)))
    print("Bloom filter. Size of the object(full): {0}".format(get_size(bf)))
    print()
    print("Bloom filter errors: {0}".format(bf_dups))
    print("----------------------------------------------------------")
Example No. 40
def __init__(self):
    self.dictionary = BloomFilter()
    with open('/usr/share/dict/words', 'r') as word_list:
        for word in word_list:
            self.dictionary.add(word.strip())
Example No. 41
def test_can_insert(self):
    bloom = BloomFilter(2000, 4)
    bloom.insert(5)
Example No. 42
def test_inserted_is_probably_contained(self):
    bloom = BloomFilter(2000, 4)
    bloom.insert(42)
    self.assertTrue(42 in bloom)
Example No. 43
def testLoadBloomFilterFromBase64File(self):
    with open(self.BF_DUMP_FILE_BASE_64, "r") as f:
        dump_str = f.read()
    bloom_filter = BloomFilter.load_from_base64_str(dump_str)
    self.check_contains(bloom_filter)
Example No. 44
def testLoadBloomFilterFromFile(self):
    with open(self.BF_DUMP_FILE, "rb") as f:
        dump_bytes = f.read()
    bloom_filter = BloomFilter.load(dump_bytes)
    self.check_contains(bloom_filter)