def test():
    runque = RedisQueueConnection('scan').conn
    size = runque.qsize()
    print size
    sleep(1)
    cnt = 0
    if size:
        while cnt < size:
            i = runque.get()
            print i
            runque.put(i)
            cnt += 1
    runque.flushdb()
    exit(0)
    f = open('seeds995k.txt')
    urls = f.read().strip().split('\n')
    if size == 0:
        i = 0
        st = time()
        for url in urls:
            runque.put(url)
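# Every snippet in this collection relies on RedisQueueConnection from redis_inc,
# whose implementation is not shown here. The sketch below is only an assumption of
# what such a wrapper might look like, inferred from how .conn is used above
# (qsize, get, put, empty, flushdb); it models the queue as a Redis list via redis-py.
import redis

class _RedisQueue(object):
    # list-backed FIFO queue exposing the methods the snippets call on .conn
    def __init__(self, name, host='127.0.0.1', port=6379, db=0):
        self.name = name
        self.r = redis.StrictRedis(host=host, port=port, db=db)

    def qsize(self):
        return self.r.llen(self.name)

    def empty(self):
        return self.qsize() == 0

    def put(self, item):
        self.r.lpush(self.name, item)   # push on the left ...

    def get(self):
        return self.r.rpop(self.name)   # ... pop on the right (FIFO)

    def flushdb(self):
        self.r.flushdb()                # drops the whole Redis db

class RedisQueueConnection(object):
    def __init__(self, name):
        self.conn = _RedisQueue(name)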
def insert():
    runque = RedisQueueConnection('extracturls').conn
    urls = flist('urlstogetrobots1.txt')[100000:300000]
    print len(urls)
    for url in urls:
        runque.put(url)
def extract():
    icmd = "insert into orignal (dbname, dbid, url, cms, headers, head) values (%s, %s, %s, %s, %s, %s)"
    runque = RedisQueueConnection('cms').conn
    size = runque.qsize()
    print "total:", size
    i = 0
    while i < size:
        item = runque.get()
        #runque.put(item)
        data = pickle.loads(item)
        ndata = []
        for item in data:
            if isinstance(item, unicode):
                item = item.encode('utf8')
            ndata.append(item)
        #print ndata
        if ndata[3]:
            print ndata[3], ndata[2]
        cur.execute(icmd, ndata)
        i += 1
    conn.commit()
    print "done"
    print runque.qsize()
def run():
    runque = RedisQueueConnection('running').conn
    #########runque.flushdb()
    size = runque.qsize()
    print size
    raw_input('flush running?')
    runque.flushdb()
def main():
    if len(sys.argv) != 4:
        print "wrong param"
        exit(0)
    port = int(sys.argv[1])
    cnt = int(sys.argv[2])
    cur = int(sys.argv[3])
    ol = 'ret_%s.txt' % (port)
    f = open(il)
    i = 0
    slist = list()
    lines = f.read().strip().split('\n')
    slist = lines[cur:len(lines):cnt]
    scancnt = len(slist)
    tmp = "scanips_%s_%s_%s" % (port, cnt, cur)
    if os.path.isfile(tmp):
        os.unlink(tmp)
    f = open(tmp, 'w')
    for ip in slist:
        f.write(ip + '\n')
    f.close()
    cmd = scancmd % (port, tmp, ol)
    print cmd
    run(cmd)
    # when the scan is done, build a bitmap of the scan result and insert it into redis
    print "run done, we collect the ips"
    tmp = open(ol).read().split('\n')[1:-2]
    ips = list()
    for ip in tmp:
        ips.append(ip.split()[3])
    alivecnt = len(ips)
    print "SCAN: %d ALIVE: %d " % (scancnt, alivecnt)
    scanque = RedisQueueConnection('scan').conn
    ipd = dict()
    for ip in ips:
        h = calc(ip)
        ht = ip.split('.')[-1]
        if h in ipd:
            ipd[h].append(ht)
        else:
            ipd[h] = [ht]
    for h in ipd:
        i = [port, h, ipd[h]]
        item = pickle.dumps(i)
        scanque.put(item)
    print "Insert into redis done "
    print "Total: %d" % (scanque.qsize())
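# calc() is defined elsewhere in the project; the grouping above suggests it derives a
# key from the leading octets so that alive hosts are stored per subnet together with
# their last octets. A hypothetical illustration of such a key function (an assumption,
# not the real calc):
def calc_prefix(ip):
    # '1.2.3.4' -> '1.2.3'
    return '.'.join(ip.split('.')[:3])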
def daemon(tempque):
    st = random.random() + 0.5
    doneque = RedisQueueConnection('robots').conn
    while True:
        if not tempque.empty():
            dat = tempque.get()
            doneque.put(dat)
        else:
            sleep(st)
def inserturls():
    runque = RedisQueueConnection('extracturls').conn
    print runque.qsize()
    raw_input('flushdb?')
    runque.flushdb()
    urls = flist('urlstogetip.txt')
    for url in urls:
        runque.put(url)
    print runque.qsize()
def test():
    runque = RedisQueueConnection('running').conn
    size = runque.qsize()
    print size
    exit(0)
    if size == 0:
        f = open('seeds995k.txt')
        for c in f:
            url = c.strip()
            runque.put(url)
def test():
    runque = RedisQueueConnection('scan').conn
    #########runque.flushdb()
    size = runque.qsize()
    t = 0
    cnt = 0
    port = 0
    data = []
    if size == 0:
        return
    tmp = runque.get()
    runque.put(tmp)
    port, iph, ipl = tmp.split(' ')
    print port, size
    raw_input('confirm:')
    while runque.qsize() > 0:
        tmp = runque.get()
        try:
            t += len(tmp.split(' ')[-1].split(','))
            data.append(tmp)
        except:
            pass
        cnt += 1
    f = "china%s_%s_%s.txt" % (port, size, t)
    fp = open(f, 'w')
    for item in data:
        fp.write(item + '\n')
    fp.close()
def test():
    runque = RedisQueueConnection('robots').conn
    #########runque.flushdb()
    size = runque.qsize()
    item = runque.get()
    runque.put(item)
    print pickle.loads(item)
    print size
    return
    raw_input('confirm')
    s = flist('urlstogetrobots.txt')
    for url in s:
        runque.put(url)
    print runque.qsize()
def show(name):
    runque = RedisQueueConnection(name).conn
    cnt = 0
    while cnt < runque.qsize():
        data = runque.get()
        runque.put(data)
        data = pickle.loads(data)
        seed = data['seed']
        data = data['content'].replace('\r', '\n').replace('\n\n', '\n').strip()
        if not data:
            continue
        if data.find('<') >= 0:
            #html page
            print seed
            continue
        robots = data.split('\n')
        print seed
        print
        print "\n".join(robots)
        print
        cnt += 1
class Daemon:

    def __init__(self, done_que):
        self.cnt = 0
        self.showpercounts = 100
        self.dbsize = 0
        self.dbsizelimit = 536870912  # 512M = 536870912
        self.spend = 0
        # queue for the daemon to receive downloaded website info
        self.done_que = done_que
        # urls queue to put filtered urls extracted from the webpage
        self.urls_que = RedisQueueConnection('extracturls').conn
        self.urlfilter = Filter()
        self.ip = getip()
        self.fname = self.getdbname()
        self.conn = sqlite3.connect(self.fname)
        self.conn.execute("create table if not exists mainpages (id integer primary key autoincrement, url TEXT, headers TEXT, content BLOB)")
        # when recv the ctrl+c signal, finish the extraction jobs and then quit
        self.quit = False

    def getdbname(self, create=False):
        path = "/work/db"
        tf = "%Y%m%d-%H%M%S"
        pre = "sitedata"
        suf = ".db"
        dbsize = 0
        ip = getip()
        findname = "%s%s" % (ip, suf)
        if create == True:
            date = time.strftime(tf, time.localtime())
            lastname = "_".join([pre, date, ip]) + suf
            self.dbsize = 0
            print "Create db: ", lastname
            return os.path.join(path, lastname)
        fnames = os.listdir(path)
        last = 0
        lastname = ""
        for fname in fnames:
            if fname.endswith(findname):
                fnow = fname.split('_')[1]
                fnown = int(time.mktime(time.strptime(fnow, tf)))
                if fnown > last:
                    last = fnown
                    lastname = fname
        # cannot find the newest db file, so create it
        if not last:
            date = time.strftime(tf, time.localtime())
            lastname = "_".join([pre, date, ip]) + suf
            print "Create db: ", lastname
            self.dbsize = 0
        else:
            print "Reuse the last db: ", lastname
            self.dbsize = os.stat(os.path.join(path, lastname)).st_size
        return os.path.join(path, lastname)

    def geturls(self, seed, content):
        urls = []
        returls = []
        if not content or len(content) == 0:
            return []
        try:
            urls = re.findall(self.urlfilter.urlpatern, content)
            returls = self.urlfilter.filter_urls(seed, urls)
        except:
            pass
        return returls

    def run(self):
        # backend job, sleep(2)
        while True:
            try:
                if self.done_que.empty():
                    if self.quit == True:
                        # url extraction is faster than the crawler, wait a moment
                        sleep(1)
                        if not self.done_que.empty():
                            continue
                        print "Daemon run done and quit successfully"
                        exit(0)
                    #print "Downloaded queue empty, wait crawler ..."
                    sleep(10)
                    continue
                data = cPickle.loads(self.done_que.get())
                seed = data['seed']
                content = data['content']
                headers = str(data['headers'])
                urls = self.geturls(seed, content)
                # put the extracted urls to urls_que
                for url in urls:
                    self.urls_que.put(url)
                # use level 1 to compress data: enough compression ratio and good speed
                gziphtml = sqlite3.Binary(gzip.zlib.compress(content, 1))
                self.dbsize += (len(gziphtml) + len(seed) + len(headers))
                self.conn.execute("insert into mainpages (url,headers,content) values (?,?,?)", (seed, headers, gziphtml))
                self.cnt += 1
                if self.cnt % self.showpercounts == 0:
                    self.conn.commit()
                    print "\n%s\n\tExtract done:%d todo:%d size:%dM" % \
                        (self.ip, self.cnt, self.done_que.qsize(), self.dbsize/1024/1024)
                    if self.dbsize > self.dbsizelimit:
                        self.fname = self.getdbname(True)
                        self.conn.close()
                        self.conn = sqlite3.connect(self.fname)
                        self.conn.execute("create table if not exists mainpages (id integer primary key autoincrement, url TEXT, headers TEXT, content BLOB)")
            except Exception as e:
                print e
            except KeyboardInterrupt:
                print "Daemon recv quit signal, waiting for queue empty"
                self.quit = True
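# The Daemon above stores each page zlib-compressed (level 1) in sqlite. A hedged
# read-back sketch, assuming one of the sitedata_*.db files produced by getdbname();
# the file name below is only an example:
import sqlite3
import zlib

def dump_pages(dbfile='sitedata_20150101-000000_1.2.3.4.db', limit=10):
    conn = sqlite3.connect(dbfile)
    cur = conn.execute("select url, headers, content from mainpages limit ?", (limit,))
    for url, headers, blob in cur:
        html = zlib.decompress(str(blob))  # undo gzip.zlib.compress(content, 1)
        print url, len(html)
    conn.close()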
#coding:utf8
import re, os
import requests
from db_driver import db_driver
import db_file
from redis_inc import RedisQueueConnection
import cPickle as pickle
import multiprocessing as mp

cmsque = RedisQueueConnection('cms').conn

pscript = re.compile(u"<script.*?>(.*?)</script>", re.S)
pstyle = re.compile(u"<style.*?>(.*?)</style>", re.S)
ptitle = re.compile(u"<title>(.*?)</title>", re.S)
pby = re.compile(u'powered by (.*?)["\'</>]', re.S)

headtag = "</head>"
bodytag = "<body>"
htmltag = re.compile(r'<[^>]+>', re.S)

# check the version that follows the cms name, e.g.
#   dz x2.5
#   phpcms v9
# we assume a version string following a cms name must contain digits
def body_version(poweredby):
    l = poweredby.strip().split()
    if len(l) == 1:
        return poweredby
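# A hedged illustration of the "version must contain digits" rule described above;
# has_version() is a hypothetical helper, not part of the original module:
def has_version(poweredby):
    parts = poweredby.strip().split()
    if len(parts) < 2:
        return False
    # treat the trailing token as a version only if it carries at least one digit
    return any(ch.isdigit() for ch in parts[-1])

# e.g. re.search(pby, '<p>powered by discuz! x2.5</p>') captures 'discuz! x2.5',
# for which has_version() returns True.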
# this script tests the urls.txt used in some tests
#! /bin/python
from pybloomfilter import BloomFilter
import sqlite3
import multiprocessing as mp
import zlib
import sys, os, getopt
import hashlib
import time
import re
import Queue
from redis_inc import RedisQueueConnection

#!!! db=1: use db1 to store the seeds
r = RedisQueueConnection('test').conn
cmd = "select id from mainpages"

# worker is a single process for each cpu on each computer
def worker(queue, lock, cpuid, outque):
    cpuurls = set()
    innerque = Queue.Queue()
    while queue.qsize() > 0:
        db = queue.get()
        lock.acquire()
        print "CPU-%s: running: %s" % (cpuid, db)
        lock.release()
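# A hedged sketch of how such per-cpu workers are usually launched; the db file
# discovery and the start-up wiring below are assumptions, not the original script:
if __name__ == '__main__':
    dbque = mp.Queue()
    outque = mp.Queue()
    lock = mp.Lock()
    for fname in os.listdir('.'):
        if fname.endswith('.db'):
            dbque.put(fname)  # one sqlite file per work item
    procs = [mp.Process(target=worker, args=(dbque, lock, cpuid, outque))
             for cpuid in range(mp.cpu_count())]
    for p in procs:
        p.start()
    for p in procs:
        p.join()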
import gevent
from gevent import monkey
import sys
from gevent.pool import Pool
import requests
from redis_inc import RedisQueueConnection

monkey.patch_all(thread=False)

size = 100
pool = Pool(size)
runque = RedisQueueConnection('running').conn
robotsque = RedisQueueConnection('robots').conn

def httpget(url):
    url = url + "/robots.txt"
    con = ""
    try:
        with gevent.Timeout(2) as timeout:
            req = requests.get(url, timeout=(2, 2))
            con = req.content
            #print url, len(con)
            req.close()
    except:
        pass
    data = (url, con)
    cb(data)
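# The cb() callback referenced by httpget() is not defined in this snippet. A hedged
# sketch of how the pieces above are typically wired together; cb and the driver loop
# below are assumptions, not the original code:
import cPickle as pickle

def cb(data):
    url, con = data
    if con:
        # hand the fetched robots.txt to the shared 'robots' queue
        robotsque.put(pickle.dumps({'seed': url, 'content': con}))

def drive():
    while runque.qsize() > 0:
        url = runque.get()
        pool.spawn(httpget, url)  # gevent pool caps concurrency at `size`
    pool.join()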
class Crawler:

    def __init__(self, done_que):
        self.showpercounts = 100
        self.timeout = 5
        self.starttime = time()
        self.quit = False
        self.run_que = RedisQueueConnection('running').conn
        self.done_que = done_que
        self.tasks = []
        self.done = 1
        self.errdone = set()
        self.err = Error()
        self.https_enable = 0
        self.httpget = self.httpget_requests  # down method: httpget_requests | httpget_curl
        self.poolsize = 100
        self.down_pool = Pool(size=self.poolsize)
        self.totalnettime = 0
        self.totaldownsize = 0
        self.ip = getip()

    # callback function when a greenlet of httpget runs done
    def cb_httpget(self, data=None):
        if not data:
            return
        seed, err, headers, content = data
        if err:
            self.handle_error(err, seed)
            return
        data = {'seed': seed, 'headers': headers, 'content': content}
        dat = cPickle.dumps(data)
        #self.done_que.put_nowait(dat)
        #print "done", seed
        if self.done % self.showpercounts == 0:
            self.out(seed)

    def out(self, seed):
        spendtime = time() - self.starttime
        spendtime = 1 if spendtime == 0 else spendtime
        nowh = str(int(spendtime)/3600) + ":" if spendtime > 3600 else ""
        now = "%s%02d:%02d" % (nowh, spendtime % 3600 / 60, spendtime % 60)
        print "\n%s\t%s D:%-4d R:%-7d [QPS: %.2f %.2f] %s" % (self.ip, now, (self.done), self.run_que.qsize(), \
            self.done/spendtime, self.done/self.totalnettime, str(self.err))

    def run(self):
        while self.quit == False:
            try:
                if self.run_que.qsize() == 0:
                    print "run que empty"
                    sleep(10)
                    continue
                url = self.run_que.get()
                self.run_que.put(url)
                #self.down_pool.apply_cb(self.httpget, (url,), callback=self.cb_httpget)
                # is spawn faster?
                #url = 'http://www.sdust.edu.cn'
                self.down_pool.spawn(self.httpget, url)
                self.done += 1
            except KeyboardInterrupt:
                print "Crawler recv quit signal"
                self.quit = True
        self.down_pool.join()
        print "Crawler over, quit"

    def handle_error(self, e, url):
        self.err.lasterrurl = url
        # do not record every err url, only remember the latest err url to show
        if e.find('DNSError') > 0:
            self.err.dns += 1
            #self.err.rdns.append(url)
        elif e.find('reset') > 0:  # Connection reset
            self.err.reset += 1
            #self.err.rreset.append(url)
        elif e.find('Max retries') > 0 or e.find('Connection aborted') > 0:
            self.err.conntimeout += 1
            #self.err.rconntimeout.append(url)
        elif e.find('refused') > 0:  # Connection refused
            self.err.refuse += 1
            #self.err.rrefuse.append(url)
        else:
            self.err.others += 1
            #self.err.rothers.append(url)
            print "Error", url, e

    # requests works better than curl in our tests
    def httpget_requests(self, url):
        data = None
        st = time()
        con = ""
        e = ""
        res_headers = ""
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6',
            'Accept-Encoding': 'gzip,deflate',
            'Connection': 'close',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
        }
        res = None
        done = False
        try:
            with gevent.Timeout(3, False) as timeout:
                #req.max_redirects = 2
                res = requests.get(url, headers=headers)
                con = res.content
                res.close()
                done = True
        except KeyboardInterrupt:
            raise
        except Exception as e:
            e = str(e)
            if res:
                res.close()
            # with spawn there is no callback, so we call it ourselves
            data = (url, e, None, None)
            #return url, e, None, None
        et = time()
        self.totalnettime += (et - st)
        # spawn
        if done:
            data = (url, e, res.headers, con)
        self.cb_httpget(data)
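# A hedged sketch of how the Crawler and Daemon classes are typically wired together:
# the crawler downloads into a shared done_que while the daemon drains it into sqlite
# and feeds extracted urls back to redis. The multiprocessing wiring below is an
# assumption, not the original start-up script.
import multiprocessing as mp

def run_daemon(q):
    Daemon(q).run()   # extractor/storage side

if __name__ == '__main__':
    done_que = mp.Queue()
    p = mp.Process(target=run_daemon, args=(done_que,))
    p.start()
    Crawler(done_que).run()  # downloader side (gevent pool inside)
    p.join()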
ThreadRunning = True
serverips = ['127.0.0.1']
MAX_RUNNING_COUNT = 2**19  # 2**19 = 524288
MAX_QPS = 1000  # 700+
showpercounts = 1000  # print a status line every 1000 queries sent

s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
s.bind(('', 5310))
cur = 0
s.settimeout(0.1)

done_sites_fname = 'done_sites.bin'
bfdone = BloomFilter.open(done_sites_fname)

urlsque = RedisQueueConnection('extracturls').conn
runque = RedisQueueConnection('running').conn

# count of queries sent out, and of successful query urls
querysent = 0
querysuc = 0

#dnslocalserver = "dns_server_ip"
dns_server_died = False

def insert_redis(data):
    # perform extraction and reform the url before putting it on runque
    # domain format:
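# insert_redis() is truncated above. A hedged sketch of the dedup step such a feeder
# usually performs with the bfdone bloom filter; the helper name and surrounding
# logic are assumptions:
def push_if_new(url):
    if url in bfdone:   # already crawled, skip it
        return False
    bfdone.add(url)     # remember it so it is never queued twice
    runque.put(url)
    return True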
#coding:utf8
import re, os
import redis_inc
from redis_inc import RedisQueueConnection
import cPickle as pickle
import multiprocessing as mp
import requests
import mysql_inc

cmsque = RedisQueueConnection('robots').conn
tempset = redis_inc.RedisConnection('test').conn
conn, cur = mysql_inc.gethandler()

# robots.txt Disallow fingerprints for each cms
cms = dict()
cms['disallow'] = dict()
cd = cms['disallow']
cd['dedecms'] = ['ad_js.php', 'mytag_js.php', 'feedback_js.php']
cd['phpcms'] = ['phpcms', 'phpsso_server']
cd['wordpress'] = ['wp-admin', 'wp-content', 'wp-includes']
cd['xiaocms'] = ['Print.aspx']
cd['discuz'] = ['forum.php?mod=']
cd['yiqicms'] = ['captcha']
cd['ecshop'] = ['goods_script.php']
cd['empirecms'] = ['e/enews']  # original: /e/enew/

forks = list()
for key in cms.keys():
    disk = cms[key]
    for cmsk in disk.keys():
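# The loop above (truncated) builds per-cms work items from these fingerprints. A
# hedged sketch of how a robots.txt body could be matched against cms['disallow'];
# guess_cms is a hypothetical helper, not the original code:
def guess_cms(robots_txt):
    body = robots_txt.lower()
    for name, marks in cms['disallow'].items():
        # one distinctive Disallow path is enough to attribute the cms
        if any(mark.lower() in body for mark in marks):
            return name
    return None

# e.g. guess_cms("User-agent: *\nDisallow: /wp-admin/") -> 'wordpress'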
def getsize(name):
    runque = RedisQueueConnection(name).conn
    print runque.qsize()
    i = runque.get()
    runque.put(i)
    print i
class Crawler:

    def __init__(self):
        self.showpercounts = 10
        self.timeout = 20
        self.poolsize = 100
        self.down_pool = Pool(size=self.poolsize)
        self.run_que = RedisQueueConnection('running').conn
        self.doneque = RedisQueueConnection('robots').conn
        self.tempque = Queue()
        self.done = 1
        self.sent = 0
        self.quit = False
        self.err = Error()
        self.https_enable = 0
        self.httpget = self.httpget_requests  # down method: httpget_requests | httpget_curl
        self.totalnettime = 0
        self.totaldownsize = 0
        self.starttime = time()
        self.ip = getip()
        self.headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6',
            'Accept-Encoding': 'gzip,deflate',
            'Connection': 'close',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
        }

    # callback function when a greenlet of httpget runs done
    def cb_httpget(self, data=None):
        if not data:
            return
        seed, err, headers, content = data
        if err:
            self.handle_error(err, seed)
            return
        if len(content) <= 0:
            return
        # content is robots.txt, normally pure text
        data = {'seed': seed, 'headers': headers, 'content': content}
        dat = cPickle.dumps(data)
        self.tempque.put(dat)
        self.done += 1
        if self.done % self.showpercounts == 0:
            self.out(seed)

    def out(self, seed):
        spendtime = time() - self.starttime
        spendtime = 1 if spendtime == 0 else spendtime
        print "\n%s D:%-4d DT: %4d R:%-7d [QPS: %.2f %.2f] %s" % (self.ip, self.done, self.doneque.qsize(), self.run_que.qsize(), \
            self.done/spendtime, self.done/self.totalnettime, str(self.err))

    def run(self):
        while self.quit == False:
            try:
                if self.run_que.qsize() == 0:
                    print "run que empty"
                    sleep(60)
                url = self.run_que.get()
                self.down_pool.spawn(self.httpget, url)
                self.sent += 1
            except KeyboardInterrupt:
                print "Crawler recv quit signal"
                self.quit = True
        self.down_pool.join()
        print "Crawler over, quit"

    def handle_error(self, e, url):
        self.err.lasterrurl = url
        # do not record every err url, only remember the latest err url to show
        if e.find('DNSError') > 0:
            self.err.dns += 1
            #self.err.rdns.append(url)
        elif e.find('reset') > 0:  # Connection reset
            self.err.reset += 1
            #self.err.rreset.append(url)
        elif e.find('Max retries') > 0 or e.find('Connection aborted') > 0:
            self.err.conntimeout += 1
            #self.err.rconntimeout.append(url)
        elif e.find('refused') > 0:  # Connection refused
            self.err.refuse += 1
            #self.err.rrefuse.append(url)
        else:
            self.err.others += 1
            #self.err.rothers.append(url)
            print "Error", url, e

    # requests works better than curl in our tests
    def httpget_requests(self, url):
        data = None
        st = time()
        con = ""
        e = ""
        res_headers = ""
        res = None
        done = False
        try:
            with gevent.Timeout(self.timeout, False) as timeout:
                url = url + '/robots.txt'
                res = requests.get(url, headers=self.headers)
                if res.status_code == 200:
                    con = res.content
                    done = True
                res.close()
        except KeyboardInterrupt:
            raise
        except Exception as e:
            e = str(e)
            if res:
                res.close()
            data = (url, e, None, None)
        et = time()
        self.totalnettime += (et - st)
        # spawn
        if done:
            data = (url, e, res.headers, con)
        #self.cb_httpget(data)
        # the callback body is inlined here instead of calling cb_httpget
        if not data:
            return
        seed, err, headers, content = data
        if err:
            self.handle_error(err, seed)
            return
        if len(content) <= 0:
            return
        # content is robots.txt, normally pure text
        data = {'seed': seed, 'headers': headers, 'content': content}
        dat = cPickle.dumps(data)
        self.tempque.put(dat)
        self.done += 1
        if self.done % self.showpercounts == 0:
            #self.out(seed)
            spendtime = time() - self.starttime
            spendtime = 1 if spendtime == 0 else spendtime
            print "\n%s D:%-4d DT:%4d R:%-7d [QPS: %.2f %.2f] %s" % (self.ip, self.done, self.doneque.qsize(), self.run_que.qsize(), \
                self.done/spendtime, self.sent/spendtime, str(self.err))
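# A hedged sketch of how this robots Crawler and the daemon(tempque) helper shown
# earlier are usually wired together: the daemon greenlet drains the in-process
# tempque into the shared 'robots' redis queue while the crawler keeps downloading.
# The start-up code below is an assumption, not the original script.
import gevent

if __name__ == '__main__':
    crawler = Crawler()
    relay = gevent.spawn(daemon, crawler.tempque)  # background relay greenlet
    crawler.run()                                  # blocks until ctrl+c
    relay.kill()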
def rmdb(test):
    runque = RedisQueueConnection(test).conn
    print runque.qsize()
    raw_input('yes?')
    runque.flushdb()