def test_speed():
    """Benchmark BloomFilter.add(): build n random 20-char strings, add them
    all, and print per-add / per-hash timings.

    Fix: converted Python 2 ``print`` statements to ``print()`` calls so this
    block agrees with the annotated (Python 3) definitions in this file, and
    replaced quadratic ``+=`` string building with ``"".join``.
    """
    n = 10000
    p = 0.0001  # target false-positive probability
    b = BloomFilter(n, p)
    print(b)

    # Random fixed-length strings; a set de-duplicates (so ns may be < n).
    string_size = 20
    strings = set()
    for _ in range(n):
        strings.add("".join(chr(random.randint(0, 255))
                            for _ in range(string_size)))

    # Time only the add loop.
    starttime = time.time()
    for string in strings:
        b.add(string)
    total_time = float(time.time() - starttime)

    ns = float(len(strings))
    k = float(b.k)  # number of hash functions the filter chose
    print("Number of hash functions: %d" % b.k)
    print("Speed per hash: %f seconds" % (total_time / ns / k))
    print("Speed per add: %f seconds" % (total_time / ns))
def __init__(self, depth: int, max_url_nums: int, cookies: str,
             exclude_urls: List[str], domain_reg_list: List[str],
             path_dicts: List[str], header: dict = None):
    """Initialize the crawler state.

    :param depth: maximum crawl depth; falsy values fall back to 5.
    :param max_url_nums: crawl budget; falsy values fall back to 5000.
    :param cookies: cookie header string sent with requests.
    :param exclude_urls: shell-style patterns ('*' wildcard) of URLs to skip.
    :param domain_reg_list: domain regexes restricting the crawl scope.
    :param path_dicts: extra paths to probe.
    :param header: optional extra request headers.

    Fixes: removed the duplicated ``self.header = header`` assignment and
    sized the BloomFilter from the resolved ``self.max_url_nums`` instead of
    the raw parameter (a falsy ``max_url_nums`` previously raised/mis-sized).
    """
    self.domain_reg = ''
    self.domain_reg_list = domain_reg_list
    self.complement = 0
    # Fall back to defaults when the caller passes a falsy value (None or 0).
    self.depth = 5 if not depth else depth
    self.max_url_nums = 5000 if not max_url_nums else max_url_nums
    self.cookie = cookies
    # Translate shell-style '*' wildcards into the regex fragment '\S*'.
    self.exclude_urls = [url.replace('*', '\\S*') for url in exclude_urls]
    self.url_dict = dict()
    # Use the resolved budget so a falsy max_url_nums cannot break sizing.
    self.url_cache = BloomFilter(element_num=self.max_url_nums * 5,
                                 error_rate=0.01)
    self.current_depth = 0
    self.current_crawl_queue = list()
    self.next_crawl_queue = list()
    self.max_queue_length = self.max_url_nums + 1000
    self.header = header
    self.path_dicts = path_dicts
    # Static-asset extensions that are never crawled.
    self.filter_exts = [
        'css', 'png', 'gif', 'jpg', 'jpeg', 'swf', 'tiff', 'pdf', 'ico',
        'flv', 'mp4', 'mp3', 'avi', 'mpg', 'gz', 'mpeg', 'iso', 'dat',
        'mov', 'rar', 'exe', 'zip', 'tar', 'bin', 'bz2', 'xsl', 'doc',
        'docx', 'ppt', 'pptx', 'xls', 'xlsx', 'csv', 'map', 'ttf', 'tif',
        'woff', 'woff2', 'cab', 'apk', 'bmp', 'svg', 'exif', 'xml', 'rss',
        'webp'
    ]
    self.exclude_urls_reg_str = ''
def test_error_rate():
    """Empirically measure the BloomFilter false-positive rate.

    Adds n random strings, then probes n *different* random strings and
    counts how many falsely test as members; the observed rate should be
    close to the requested probability p.

    Fix: converted Python 2 ``print`` statements to ``print()`` calls so this
    block agrees with the annotated (Python 3) definitions in this file, and
    replaced quadratic ``+=`` string building with ``"".join``.
    """
    n = 10000
    p = 0.001
    b = BloomFilter(n, p)
    print("Creating BloomFilter for %d elements and false positive "
          "probability = %f ..." % (n, p))
    print("Optimal values are m = %d, k = %d" % (b.m, b.k))

    # Sanity check: membership flips from False to True after add().
    elt = 'apple'
    print("Testing...")
    assert elt not in b
    print("After adding '%s'..." % elt)
    b.add(elt)
    print("Testing...")
    assert elt in b

    string_size = 20

    def _random_strings(count):
        # Fixed-length random strings; a set de-duplicates collisions.
        made = set()
        for _ in range(count):
            made.add("".join(chr(random.randint(0, 255))
                             for _ in range(string_size)))
        return made

    strings = _random_strings(n)        # members
    other_strings = _random_strings(n)  # non-members to probe with

    for s in strings:
        b.add(s)

    # Only probe strings that are genuinely absent from the filter's input.
    other_strings = other_strings - strings
    collisions = sum(1 for s in other_strings if s in b)
    print("False positive rate was %d / %d = %f" % (
        collisions, len(other_strings),
        float(collisions) / float(len(other_strings))))
def __init__(self, cookie: str = None, headers: dict = None,
             max_num: int = 10000, domain_regs: list = None,
             depth: int = 5):
    """Initialize multiprocessing-backed crawler state.

    :param cookie: cookie header string sent with requests.
    :param headers: request headers; falls back to DEFAULT_HEADERS.
    :param max_num: crawl budget (queues sized 2x, bloom filter 5x).
    :param domain_regs: domain regexes restricting the crawl scope.
    :param depth: maximum crawl depth.

    Fix: the original called ``Manager()`` three times; every call spawns a
    separate manager server process. One shared manager now backs both
    queues and the shared dict.
    """
    self.cookie = cookie
    self.headers = headers if headers else DEFAULT_HEADERS
    manager = Manager()  # single manager process for all shared objects
    self.waiting_queue = manager.Queue(maxsize=max_num * 2)
    self.current_queue = manager.Queue(maxsize=max_num * 2)
    self.max_url_num = max_num
    self.crawled_urls = BloomFilter(element_num=max_num * 5, error_rate=0.01)
    self.url_dict = manager.dict()
    self.domain_reg_list = domain_regs
    self.depth = depth
    self.current_depth = 0
    # Static-asset extensions that are never crawled.
    self.filter_exts = [
        'css', 'png', 'gif', 'jpg', 'jpeg', 'swf', 'tiff', 'pdf', 'ico',
        'flv', 'mp4', 'mp3', 'avi', 'mpg', 'gz', 'mpeg', 'iso', 'dat',
        'mov', 'rar', 'exe', 'zip', 'tar', 'bin', 'bz2', 'xsl', 'doc',
        'docx', 'ppt', 'pptx', 'xls', 'xlsx', 'csv', 'map', 'ttf', 'tif',
        'woff', 'woff2', 'cab', 'apk', 'bmp', 'svg', 'exif', 'xml', 'rss',
        'webp', 'js'
    ]
def setUp(self):
    # Build a fresh filter before each test case.
    # NOTE(review): other call sites in this file use BloomFilter(n, p) or
    # BloomFilter(element_num=..., error_rate=...); here the positional args
    # are (0.001, 10**3) — presumably (error_rate, capacity) for a different
    # BloomFilter implementation. Confirm the argument order against the
    # constructor actually imported by this test module.
    self.bf = BloomFilter(0.001,10**3)