Exemplo n.º 1
0
class BloomFilter(object):
    """Probabilistic membership test backed by a ``BitSet``.

    Every configured hash function maps a value to one bit index (its hash
    modulo the filter's bit capacity). Adding a value sets those bits; a
    membership test reports True only when all of them are set. A value
    whose bits were all set by other values is therefore also reported as
    present (false positive).
    """

    def __init__(self, *hash_functions, **kwds):
        """Create an empty filter.

        @param hash_functions: Callables invoked with a value; each result is
            reduced modulo the bit capacity to obtain a bit index. When none
            are given, object_hash, object_repr_hash, object_str_hash and
            object_id are used.
        @param max_size: Keyword argument, in bytes (default 1024). It is
            rounded up to a whole multiple of the bit set's ``itemsize``
            before being converted to a number of bits. Any other keyword
            argument is ignored.
        """
        self.__bitset = BitSet()

        if not hash_functions:
            hash_functions = (object_hash, object_repr_hash, object_str_hash, object_id)
        self.__hash_functions = hash_functions

        max_size = kwds.get("max_size", 1024)
        itemsize = self.__bitset.itemsize
        # Round the byte budget up to a multiple of itemsize, then
        # multiply by 8 (<< 3) to express the capacity in bits.
        self.__max_bits = int(itemsize * ceil(float(max_size) / itemsize)) << 3

    def extend(self, values):
        """Add every item of the iterable ``values`` to the filter."""
        # Explicit loop instead of map(): on Python 3 map() is lazy, so using
        # it only for its side effects would silently add nothing.
        for value in values:
            self.add(value)

    def __get_bit_indexes_of(self, value):
        """Return a generator of the bit indexes ``value`` maps to.

        One index is produced per hash function, in the order the functions
        were configured.
        """
        return (hf(value) % self.__max_bits for hf in self.__hash_functions)

    def add(self, value):
        """Set the bits that ``value`` maps to."""
        indexes_to_set = self.__get_bit_indexes_of(value)
        self.__bitset.set_indexes(indexes_to_set)

    def __contains__(self, value):
        """Return True when every bit ``value`` maps to is set."""
        indexes_to_get = self.__get_bit_indexes_of(value)
        # all() over a generator stops at the first index that is not set.
        return all(index in self.__bitset for index in indexes_to_get)

    def __len__(self):
        """Return the bit set's length shifted right by 3 (divided by 8).

        ``__repr__`` reports this figure as a number of bytes.
        """
        return len(self.__bitset) >> 3

    def __repr__(self):
        """Return a description with the byte length and the bit set."""
        return "BloomFilter (%s bytes): %s" % (len(self), self.__bitset)
Exemplo n.º 2
0
 def find_matches(self, inverted_index):
     """Return a new BitSet with the matching document ids set.

     The ids are whatever ``inverted_index.get_doc_ids_with`` returns for
     this object's ``_field_id`` and ``_value``.
     """
     matches = BitSet()
     doc_ids = inverted_index.get_doc_ids_with(self._field_id, self._value)
     matches.set_indexes(doc_ids)
     return matches