def __init__(self, **kwargs):
    """Initialize the spider: set up the problem-report mailer, push Mongo
    settings, and load already-known URLs from MySQL de-duplicated through
    a Bloom filter (result kept in self.urllist)."""
    super(CarSpider, self).__init__(**kwargs)
    # Problem-report mailer.
    self.mailer = MailSender.from_settings(settings)
    self.counts = 0
    # Mongo settings.
    settings.set('CrawlCar_Num', carnum, priority='cmdline')
    settings.set('MONGODB_DB', 'usedcar', priority='cmdline')
    settings.set('MONGODB_COLLECTION', website, priority='cmdline')
    # MySQL: read the list of already-seen new-car URLs.
    mysqldb = MySQLdb.connect("192.168.1.94", "root", "Datauser@2017",
                              "usedcar", port=3306)
    try:
        mysqldbc = mysqldb.cursor()
        mysqldbc.execute("select newcarurl from che58")
        items = mysqldbc.fetchall()
    finally:
        # BUGFIX: the connection was previously never closed.
        mysqldb.close()
    self.urllist = []
    df = pybloom.BloomFilter(carnum, 0.01)
    for row in items:
        url = row[0]
        # BUGFIX: add the md5 *hex digest* string, not the md5 object.
        # The original passed the hash object itself, whose textual form
        # is unique per instance, so the filter never detected duplicates.
        # hexdigest() matches the convention used by Bloom_Init elsewhere.
        fingerprint = hashlib.md5(url).hexdigest()
        already_seen = df.add(fingerprint)
        if not already_seen:
            self.urllist.append(url)
def process_loops(self, node, context):
    """Push *node* onto the walk recorded in *context*.

    Maintains per-walk state inside *context* (path list, Bloom filter of
    visited nodes, detection counter).  Returns False once the number of
    Bloom-filter hits reaches self.detections — i.e. a loop is assumed —
    otherwise True so traversal continues.
    """
    context.setdefault("path", [])
    context.setdefault("loop?", False)
    context.setdefault("detection", 0)
    if "bf" not in context:
        context["bf"] = pb.BloomFilter(self.capacity, self.error_rate)

    # First time we revisit a node already on the exact path, remember
    # where the loop starts and how long it is.
    if "loopstart" not in context:
        try:
            start = context["path"].index(node)
        except ValueError:
            pass
        else:
            context["loopstart"] = start
            context["loopsize"] = len(context["path"]) - start

    # Probabilistic re-visit check; stop after enough hits.
    if node in context["bf"]:
        context["detection"] += 1
        if context["detection"] >= self.detections:
            context["loop?"] = True
            return False

    context["path"].append(node)
    context["bf"].add(node)
    return True
def report(self, oneline = False):
    """Print a capacity/error-rate/memory report for this detector.

    Fields are comma-separated on one line when *oneline* is true,
    otherwise printed one per line (Python 2 `print ...,` suppresses the
    automatic newline so `nl` controls the separator).
    """
    nl = "," if oneline else "\n"
    # Throwaway filter built only to read the derived num_slices/num_bits.
    bf = pb.BloomFilter(self.capacity, self.error_rate)
    print self.__class__.__name__, nl,
    print self.pcsv("Null:"), "--", nl,
    print self.pcsv("Cap:"), self.capacity, nl,
    print self.pcsv("Rate:"), self.error_rate, nl,
    print self.pcsv("Hashes:"), bf.num_slices, nl,
    # Memory estimate: filter bits plus log2(detections) bits for the
    # detection counter.
    print self.pcsv("Mem:"), bf.num_bits + math.log(self.detections, 2), self.pcsv("bits"), nl,
    # NOTE(review): super(self.__class__, self) recurses infinitely if this
    # class is ever subclassed — prefer naming the class explicitly.
    super(self.__class__, self).report(oneline)
def get_bf(self, w_size, offset):
    """Build a Bloom filter of opcode windows taken from self.opcodes.

    Slides a window of *w_size* opcodes backwards over self.opcodes,
    starting *offset* elements from the end; each window is serialized as
    one opcode per line (trailing newline included) and added to the
    filter.

    :param w_size: number of opcodes per window
    :param offset: start position, counted from the end of self.opcodes
    :return: the populated pybloom.BloomFilter
    """
    bitshred = pybloom.BloomFilter(capacity=10000, error_rate=0.001)
    for i in range(-1 + offset, (len(self.opcodes) - w_size) * -1, -1):
        # Serialize the reversed slice in a single join instead of
        # repeated string concatenation; each opcode keeps its newline.
        window = "".join(asm + "\n" for asm in self.opcodes[i:i - w_size:-1])
        bitshred.add(window)
    return bitshred
def Bloom_Init(mysqltable, collection, mysqldbc, bfrate=0.001, keycol="statusplus"):
    """Create a Bloom filter pre-seeded with the md5 hex digest of every
    *keycol* value already present in *mysqltable*.

    :param mysqltable: MySQL table name holding existing keys
    :param collection: Mongo collection whose count sizes the filter
    :param mysqldbc: open MySQL cursor
    :param bfrate: desired false-positive rate
    :param keycol: column holding the de-duplication key
    :return: the populated pybloom.BloomFilter
    """
    # Size the filter 10% above the current collection count.  pybloom
    # requires an integer capacity > 0, so truncate and floor at 1 —
    # the original passed a float and crashed on an empty collection.
    num = max(int(collection.count() * 1.1), 1)
    df = pybloom.BloomFilter(capacity=num, error_rate=bfrate)
    # NOTE(review): table/column names are interpolated into the SQL text;
    # they must come from trusted configuration, never from user input.
    mysqldbc.execute("select " + keycol + " from " + mysqltable)
    for row in mysqldbc.fetchall():
        df.add(hashlib.md5(row[0]).hexdigest())
    return df
def __init__(self, n=10000):
    """Initialize crawler state: counters, unbounded page cache, a Bloom
    filter (capacity *n*) for URL de-duplication, a lock, request headers,
    and the spider's logger — then open the spider."""
    self.count = 0
    self.max_size = 100
    self.cache = queue.Queue(0)            # unbounded work queue
    self.pageset = pybloom.BloomFilter(n)  # pages already seen
    self.lock = threading.Lock()
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; '
                      'Googlebot/2.1; +http://www.google.com/bot.html)',
        'Proxy-Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'DNT': '1',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-SG,zh;q=0.9,zh-CN;q=0.8,en;q=0.7,zh-TW;q=0.6',
    }
    self.url_header = headers
    self.start_time = time.time()
    self.logger = utils.get_logger("cadre.spider")
    self.open_spider()
def main(): setup=json.load(open(sys.argv[1],'r')) feeds=json.load(open(sys.argv[2],'r')) bloom=pb.BloomFilter(1000000) bloomloc=sys.argv[3] try: bloom=bloom.fromfile(open(bloomloc,'r')) except: print "starting over" pass z = "" for x,y in feeds.iteritems(): temp_title ="<h1>" + x + "</h1><br>\n " temp_feeds = feeds_to_html(parse_feeds(y, bloom)) if len(temp_feeds)>0: z += temp_title+temp_feeds print x, len(temp_feeds) if len(z) > 0: send_email("RSS digest", "<body>\n" + z + "</body>", setup) bloom.tofile(open(bloomloc,'w'))
## Import external configuration if set if args.config: dirname = os.path.dirname(args.config) basename = os.path.basename(args.config) modname = os.path.splitext(basename)[0] sys.path.insert(0, dirname) globals().update(importlib.import_module(modname).__dict__) # ## Generate bf_error_rates if not set if enbloomfilter: if not len(bf_error_rates): bf_num_bits_pairs = [(pb.BloomFilter(bf_capacity, p).num_bits, p) for p in [(z + 1) / 1000000. for z in xrange(999999)]] bf_error_rates = [ w for (i, (q, w)) in enumerate(bf_num_bits_pairs) if q < bf_num_bits_pairs[i - 1][0] ] # ## Override number of runs if set if args.runs: packets = args.runs # ## Create topology if necessary
# Import external configuration: put its directory on sys.path, import it
# as a module, and merge every name it defines into this script's globals.
dirname = os.path.dirname(args.config)
basename = os.path.basename(args.config)
modname = os.path.splitext(basename)[0]
sys.path.insert(0, dirname)
globals().update(importlib.import_module(modname).__dict__)
#
## Generate bf_error_rates if not set
# Sweep candidate error rates in 1e-6 steps and keep only those at which
# the filter's bit count drops below the previous candidate's — the
# distinct memory-size breakpoints.  NOTE(review): this constructs 999,999
# BloomFilter objects and is expensive.
if enbloomfilter:
    if not len(bf_error_rates):
        bf_num_bits_pairs = [(pb.BloomFilter(bf_capacity, p).num_bits, p)
                             for p in [(z+1)/1000000. for z in xrange(999999)]]
        bf_error_rates = [w for (i,(q,w)) in enumerate(bf_num_bits_pairs)
                          if q < bf_num_bits_pairs[i-1][0]]
#
## Override number of runs if set
if args.runs:
    packets = args.runs
#
## Create topology if necessary
# Load an undirected topology with hosts created and all cycles enumerated.
if topoloops or topopaths:
    topo = Topology.load(topofile, parser=topoparser, create_hosts=True,
                         allcycles=True, directed=False)
def __init__(self):
    # Bloom filter tracking URLs that have already been crawled.
    # BUGFIX: pybloom.BloomFilter's first parameter is named `capacity`,
    # not `n` — the keyword `n=` raises TypeError on the stock pybloom
    # API, and every other usage in this project passes `capacity`.
    self.scrawled_urls = pybloom.BloomFilter(capacity=10000000)
# prng = np.random.RandomState(1) # pt=prng.rand(1,1000000) # def check(list,blf): count=0 print('xx:',type(list),len(list)) i=0 for x in list: if i<2: print('t:',blf.add(x)[0]) i+=1 if blf.add(x)[0]==False: count+=1 print('w:',count) return count f = pb.BloomFilter(capacity=1500000, error_rate=0.01) pl1=[] # # f1 = pb.BloomFilter(capacity=1050, error_rate=0.001) oldtime=time.time() for x in plx: _, bi0 = f.add(x) newtime=time.time() print('0:',(newtime-oldtime)) newtime=time.time() oldtime=time.time() _, bi1= f.add(1) newtime=time.time() # print('0:',oldtime) # print('1:',newtime) print('1:',(newtime-oldtime)) # for x in plx:
import pybloom import sys if __name__ == "__main__": capacity = sys.argv[1] bf = pybloom.BloomFilter(capacity=int(capacity), error_rate=0.001) for line in sys.stdin: if not bf.add(line.strip()): print line.strip()
import requests
import urllib.parse
from bs4 import BeautifulSoup
import threading
import queue
import os
import time
import pybloom

# Number of files already downloaded under ../dsimage/file/ — used as the
# starting index for newly saved files.  (Generator instead of the
# original throwaway list inside sum().)
count = sum(len(files) for _, _, files in
            os.walk(os.path.dirname("../dsimage/file/")))
MAX_SIZE = 10000  # stop when the count reaches MAX_SIZE
INIT_URL = "http://www.mm4000.com/"
lock_count = threading.Lock()
pageset = pybloom.BloomFilter(100000)  # pages already visited
imgset = pybloom.BloomFilter(100000)   # image URLs already fetched
cache = queue.Queue(0)      # unbounded FIFO of fetched page payloads
stack = queue.LifoQueue(0)  # DFS frontier of page URLs
stack.put(INIT_URL)
urlheader = {
    'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Proxy-Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'DNT': '1',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-SG,zh;q=0.9,zh-CN;q=0.8,en;q=0.7,zh-TW;q=0.6'
}
def test_performance(): n = 100000 p = 0.001 # create set of strings to use strings = set() string_size = 50 # make this number higher # if performance test is taking too long while len(strings) < n: string = "" for j in range(string_size): string += chr(random.randint(0, 255)) strings.add(string) # create another set otherstrings = set() while len(otherstrings) < n: string = "" for j in range(string_size): string += chr(random.randint(0, 255)) if string not in strings: otherstrings.add(string) print "[*] Strings created." ### 1) pybloom import pybloom bf1 = pybloom.BloomFilter(capacity=n, error_rate=p) ### 2) pybloomfilter import pybloomfilter bf2 = pybloomfilter.BloomFilter(n, p) ### 3) bloompy import bloompy bf3 = bloompy.BloomFilter(capacity=n, error_rate=p) # add them bfs = [("pybloom", bf1), ("pybloomfilter", bf2), ("bloompy", bf3)] for s in strings: for _, b in bfs: b.add(s) print "[*] Bloom filters to compare performance:\n %s\n\n" % bfs # add all strings for _, bf in bfs: for string in strings: bf.add(string) # then test for collisions # add all strings print "[*] Now testing with %d unique strings and desired error rate of %f" % (n, p) print "[*] Performance results: " for name, bf in bfs: collisions = 0 starttime = time.time() for string in otherstrings: if string in bf: collisions += 1 elapsed = time.time() - starttime error_rate = float(collisions) / float(n) print "%s: %f seconds with error rate = %f" % (name, elapsed, error_rate)