def test():
    runque = RedisQueueConnection('scan').conn
    #########runque.flushdb()
    size = runque.qsize()
    t = 0
    cnt = 0
    port = 0
    data = []
    if size == 0:
        return
    # peek at one item (get it, then put it back) to learn which port this queue holds
    tmp = runque.get()
    runque.put(tmp)
    port, iph, ipl = tmp.split(' ')
    print port, size
    raw_input('confirm:')
    # drain the queue, counting the hosts listed in each item
    while runque.qsize() > 0:
        tmp = runque.get()
        try:
            t += len(tmp.split(' ')[-1].split(','))
            data.append(tmp)
        except:
            pass
        cnt += 1
    # dump everything to a file named after the port, queue size and host count
    f = "china%s_%s_%s.txt" % (port, size, t)
    fp = open(f, 'w')
    for item in data:
        fp.write(item + '\n')
    fp.close()
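# NOTE (assumed sketch, not from the original project): every snippet here talks to
# Redis through RedisQueueConnection(name).conn, an object exposing a small queue
# interface (put/get/qsize/empty/flushdb). The real class is defined elsewhere in the
# project; a minimal stand-in backed by a Redis list could look like this:
import redis

class SimpleRedisQueue(object):
    """Hypothetical stand-in for RedisQueueConnection(name).conn."""
    def __init__(self, name, host='127.0.0.1', port=6379, db=0):
        self.key = 'queue:%s' % name
        self.r = redis.Redis(host=host, port=port, db=db)

    def put(self, item):
        # append to the tail of the list
        self.r.rpush(self.key, item)

    def get(self):
        # blocking pop from the head of the list; blpop returns (key, value)
        return self.r.blpop(self.key)[1]

    def qsize(self):
        return self.r.llen(self.key)

    def empty(self):
        return self.qsize() == 0

    def flushdb(self):
        # wipes the whole Redis db, matching how the snippets use flushdb()
        self.r.flushdb()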
def main():
    if len(sys.argv) != 4:
        print "wrong param"
        exit(0)
    port = int(sys.argv[1])
    cnt = int(sys.argv[2])
    cur = int(sys.argv[3])
    ol = 'ret_%s.txt' % (port)
    # il, scancmd, run() and calc() come from module scope (not shown in this snippet)
    f = open(il)
    i = 0
    slist = list()
    lines = f.read().strip().split('\n')
    # take every cnt-th ip starting at offset cur, so cnt workers can split the list
    slist = lines[cur:len(lines):cnt]
    scancnt = len(slist)
    tmp = "scanips_%s_%s_%s" % (port, cnt, cur)
    if os.path.isfile(tmp):
        os.unlink(tmp)
    f = open(tmp, 'w')
    for ip in slist:
        f.write(ip + '\n')
    f.close()
    cmd = scancmd % (port, tmp, ol)
    print cmd
    run(cmd)
    # when done
    # make bitmap to store the scanret and then insert into redis
    print "run done, we collect the ips"
    tmp = open(ol).read().split('\n')[1:-2]
    ips = list()
    for ip in tmp:
        ips.append(ip.split()[3])
    alivecnt = len(ips)
    print "SCAN: %d ALIVE: %d " % (scancnt, alivecnt)
    scanque = RedisQueueConnection('scan').conn
    ipd = dict()
    # group alive ips by the prefix returned by calc(), keeping only the last octets
    for ip in ips:
        h = calc(ip)
        ht = ip.split('.')[-1]
        if h in ipd:
            ipd[h].append(ht)
        else:
            ipd[h] = [ht]
    for h in ipd:
        i = [port, h, ipd[h]]
        item = pickle.dumps(i)
        scanque.put(item)
    print "Insert into redis done "
    print "Total: %d" % (scanque.qsize())
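# NOTE (assumed sketch): main() relies on module-level names that are not shown in
# these snippets -- il (input ip list file), scancmd (scanner command template),
# run() and calc(). The definitions below are guesses written only to make the flow
# readable; the real project may use a different scanner or grouping scheme.
import subprocess

il = 'chinaips.txt'                                    # assumed input ip list file
scancmd = "masscan -p%s -iL %s -oL %s --rate 10000"    # assumed scan command template

def run(cmd):
    # run the scan command and wait for it to finish
    subprocess.call(cmd, shell=True)

def calc(ip):
    # group alive hosts by their /24 prefix so only the last octets need to be stored
    return '.'.join(ip.split('.')[:3])

# Assumed invocation: split the ip list across three workers scanning port 80:
#   python scan.py 80 3 0
#   python scan.py 80 3 1
#   python scan.py 80 3 2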
def insert():
    runque = RedisQueueConnection('extracturls').conn
    urls = flist('urlstogetrobots1.txt')[100000:300000]
    print len(urls)
    for url in urls:
        runque.put(url)
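# NOTE (assumed sketch): flist() is a project helper not shown in these snippets;
# from its usage it presumably just loads a text file as a list of lines.
def flist(fname):
    # read a file and return its non-empty, stripped lines
    return [line.strip() for line in open(fname) if line.strip()]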
def test():
    runque = RedisQueueConnection('scan').conn
    size = runque.qsize()
    print size
    sleep(1)
    cnt = 0
    if size:
        # queue already has items: print each one (putting it back), then clear the db
        while cnt < size:
            i = runque.get()
            print i
            runque.put(i)
            cnt += 1
        runque.flushdb()
        exit(0)
    f = open('seeds995k.txt')
    urls = f.read().strip().split('\n')
    if size == 0:
        # empty queue: seed it with the url list
        i = 0
        st = time()
        for url in urls:
            runque.put(url)
def daemon(tempque):
    st = random.random() + 0.5
    doneque = RedisQueueConnection('robots').conn
    while True:
        if not tempque.empty():
            dat = tempque.get()
            doneque.put(dat)
        else:
            sleep(st)
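# NOTE (assumed wiring, names illustrative): daemon() drains a local in-process queue
# into the shared 'robots' Redis queue. Since the project already uses gevent, one
# plausible way to run it alongside the fetching greenlets:
import gevent
from gevent.queue import Queue

tempque = Queue()              # local queue the fetchers put results into
gevent.spawn(daemon, tempque)  # relay greenlet feeding the 'robots' Redis queue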
def inserturls():
    runque = RedisQueueConnection('extracturls').conn
    print runque.qsize()
    raw_input('flushdb?')
    runque.flushdb()
    urls = flist('urlstogetip.txt')
    for url in urls:
        runque.put(url)
    print runque.qsize()
def test():
    runque = RedisQueueConnection('running').conn
    size = runque.qsize()
    print size
    exit(0)
    if size == 0:
        f = open('seeds995k.txt')
        for c in f:
            url = c.strip()
            runque.put(url)
def test():
    runque = RedisQueueConnection('robots').conn
    #########runque.flushdb()
    size = runque.qsize()
    item = runque.get()
    runque.put(item)
    print pickle.loads(item)
    print size
    return
    # unreachable until the return above is removed: reseed the queue from file
    raw_input('confirm')
    s = flist('urlstogetrobots.txt')
    for url in s:
        runque.put(url)
    print runque.qsize()
def show(name):
    runque = RedisQueueConnection(name).conn
    cnt = 0
    while cnt < runque.qsize():
        data = runque.get()
        runque.put(data)
        data = pickle.loads(data)
        seed = data['seed']
        data = data['content'].replace('\r', '\n').replace('\n\n', '\n').strip()
        if not data:
            continue
        if data.find('<') >= 0:
            #html page
            print seed
            continue
        robots = data.split('\n')
        print seed
        print
        print "\n".join(robots)
        print
        cnt += 1
class Daemon:
    def __init__(self, done_que):
        self.cnt = 0
        self.showpercounts = 100
        self.dbsize = 0
        self.dbsizelimit = 536870912  # 512MB
        self.spend = 0
        #queue on which the daemon receives downloaded website info
        self.done_que = done_que
        #queue to put the filtered urls extracted from the webpages
        self.urls_que = RedisQueueConnection('extracturls').conn
        self.urlfilter = Filter()
        self.ip = getip()
        self.fname = self.getdbname()
        self.conn = sqlite3.connect(self.fname)
        self.conn.execute("create table if not exists mainpages (id integer primary key autoincrement, url TEXT, headers TEXT, content BLOB)")
        #when the ctrl+c signal is received, finish the remaining extraction jobs and then quit
        self.quit = False

    def getdbname(self, create=False):
        path = "/work/db"
        tf = "%Y%m%d-%H%M%S"
        pre = "sitedata"
        suf = ".db"
        dbsize = 0
        ip = getip()
        findname = "%s%s" % (ip, suf)
        if create == True:
            date = time.strftime(tf, time.localtime())
            lastname = "_".join([pre, date, ip]) + suf
            self.dbsize = 0
            print "Create db: ", lastname
            return os.path.join(path, lastname)
        fnames = os.listdir(path)
        last = 0
        lastname = ""
        for fname in fnames:
            if fname.endswith(findname):
                fnow = fname.split('_')[1]
                fnown = int(time.mktime(time.strptime(fnow, tf)))
                if fnown > last:
                    last = fnown
                    lastname = fname
        #could not find an existing db file, so create one
        if not last:
            date = time.strftime(tf, time.localtime())
            lastname = "_".join([pre, date, ip]) + suf
            print "Create db: ", lastname
            self.dbsize = 0
        else:
            print "Reuse the last db: ", lastname
            self.dbsize = os.stat(os.path.join(path, lastname)).st_size
        return os.path.join(path, lastname)

    def geturls(self, seed, content):
        urls = []
        returls = []
        if not content or len(content) == 0:
            return []
        try:
            urls = re.findall(self.urlfilter.urlpatern, content)
            returls = self.urlfilter.filter_urls(seed, urls)
        except:
            pass
        return returls

    def run(self):
        #backend job, sleep(2)
        while True:
            try:
                if self.done_que.empty():
                    if self.quit == True:
                        #url extraction is faster than the crawler, give the queue one last check
                        sleep(1)
                        if not self.done_que.empty():
                            continue
                        print "Daemon run done and quit successfully"
                        exit(0)
                    #print "Downloaded queue empty, wait crawler ..."
                    sleep(10)
                    continue
                data = cPickle.loads(self.done_que.get())
                seed = data['seed']
                content = data['content']
                headers = str(data['headers'])
                urls = self.geturls(seed, content)
                #put the extracted urls into urls_que
                for url in urls:
                    self.urls_que.put(url)
                #compress with zlib level 1: enough compression ratio at good speed
                gziphtml = sqlite3.Binary(gzip.zlib.compress(content, 1))
                self.dbsize += (len(gziphtml) + len(seed) + len(headers))
                self.conn.execute("insert into mainpages (url,headers,content) values (?,?,?)", (seed, headers, gziphtml))
                self.cnt += 1
                if self.cnt % self.showpercounts == 0:
                    self.conn.commit()
                    print "\n%s\n\tExtract done:%d todo:%d size:%dM" % \
                        (self.ip, self.cnt, self.done_que.qsize(), self.dbsize/1024/1024)
                    #rotate to a new db file once the current one exceeds the size limit
                    if self.dbsize > self.dbsizelimit:
                        self.fname = self.getdbname(True)
                        self.conn.close()
                        self.conn = sqlite3.connect(self.fname)
                        self.conn.execute("create table if not exists mainpages (id integer primary key autoincrement, url TEXT, headers TEXT, content BLOB)")
            except Exception as e:
                print e
            except KeyboardInterrupt:
                print "Daemon received quit signal, waiting for the queue to empty"
                self.quit = True
# (fragment: the enclosing definition, apparently httpget(url), and its try: block are truncated above)
        con = req.content
        #print url, len(con)
        req.close()
    except:
        pass
    data = (url, con)
    cb(data)

from time import time

def cb(data):
    seed, con = data
    #print "\t", seed, len(con)

cnt = 0
sst = time()
while True:
    url = runque.get()
    runque.put(url)
    st = time()
    pool.spawn(httpget, url)
    et = time()
    cnt += 1
    if cnt % 10 == 0:
        print cnt / (et - sst), runque.qsize(), robotsque.qsize()
class Crawler:
    def __init__(self, done_que):
        self.showpercounts = 100
        self.timeout = 5
        self.starttime = time()
        self.quit = False
        self.run_que = RedisQueueConnection('running').conn
        self.done_que = done_que
        self.tasks = []
        self.done = 1
        self.errdone = set()
        self.err = Error()
        self.https_enable = 0
        self.httpget = self.httpget_requests  # download method: httpget_requests | httpget_curl
        self.poolsize = 100
        self.down_pool = Pool(size=self.poolsize)
        self.totalnettime = 0
        self.totaldownsize = 0
        self.ip = getip()

    #callback invoked when a httpget greenlet finishes
    def cb_httpget(self, data=None):
        if not data:
            return
        seed, err, headers, content = data
        if err:
            self.handle_error(err, seed)
            return
        data = {'seed': seed, 'headers': headers, 'content': content}
        dat = cPickle.dumps(data)
        #self.done_que.put_nowait(dat)
        #print "done", seed
        if self.done % self.showpercounts == 0:
            self.out(seed)

    def out(self, seed):
        spendtime = time() - self.starttime
        spendtime = 1 if spendtime == 0 else spendtime
        nowh = str(int(spendtime)/3600) + ":" if spendtime > 3600 else ""
        now = "%s%02d:%02d" % (nowh, spendtime%3600/60, spendtime%60)
        print "\n%s\t%s D:%-4d R:%-7d [QPS: %.2f %.2f] %s" % (self.ip, now, (self.done), self.run_que.qsize(), \
            self.done/spendtime, self.done/self.totalnettime, str(self.err))

    def run(self):
        while self.quit == False:
            try:
                if self.run_que.qsize() == 0:
                    print "run que empty"
                    sleep(10)
                    continue
                url = self.run_que.get()
                self.run_que.put(url)
                #self.down_pool.apply_cb(self.httpget, (url,), callback=self.cb_httpget)
                # spawn is faster?
                #url = 'http://www.sdust.edu.cn'
                self.down_pool.spawn(self.httpget, url)
                self.done += 1
            except KeyboardInterrupt:
                print "Crawler received quit signal"
                self.quit = True
        self.down_pool.join()
        print "Crawler over, quit"

    def handle_error(self, e, url):
        # do not record every error url, only keep the latest one for display
        self.err.lasterrurl = url
        if e.find('DNSError') > 0:
            self.err.dns += 1
            #self.err.rdns.append(url)
        elif e.find('reset') > 0:  #Connection reset
            self.err.reset += 1
            #self.err.rreset.append(url)
        elif e.find('Max retries') > 0 or e.find('Connection aborted') > 0:
            self.err.conntimeout += 1
            #self.err.rconntimeout.append(url)
        elif e.find('refused') > 0:  #Connection refused
            self.err.refuse += 1
            #self.err.rrefuse.append(url)
        else:
            self.err.others += 1
            #self.err.rothers.append(url)
            print "Error", url, e

    # requests performed better than curl in tests
    def httpget_requests(self, url):
        #return data
        data = None
        st = time()
        con = ""
        e = ""
        res_headers = ""
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6',
            'Accept-Encoding': 'gzip,deflate',
            'Connection': 'close',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
        }
        res = None
        done = False
        try:
            with gevent.Timeout(3, False) as timeout:
                #req.max_redirects = 2
                res = requests.get(url, headers=headers)
                con = res.content
                res.close()
                done = True
        except KeyboardInterrupt:
            raise
        except Exception as e:
            e = str(e)
            if res:
                res.close()
            #as for spawn there is no callback, so we call cb_httpget ourselves
            data = (url, e, None, None)  #return url,e,None,None
        et = time()
        self.totalnettime += (et - st)
        #spawn
        if done:
            data = (url, e, res.headers, con)
        self.cb_httpget(data)
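# NOTE (assumed wiring, not the project's real entry point): Crawler and Daemon
# communicate through done_que -- cb_httpget() pickles each downloaded page for it
# (the put_nowait call is commented out in this snippet), and Daemon.run() stores
# pages in sqlite while pushing extracted urls back into the 'extracturls' Redis
# queue. A minimal way to wire the two together; Queue/Thread choice is illustrative:
from multiprocessing import Queue
from threading import Thread

def start():
    done_que = Queue()
    d = Daemon(done_que)
    t = Thread(target=d.run)    # run the extractor/storage loop in the background
    t.setDaemon(True)
    t.start()
    Crawler(done_que).run()     # run the gevent crawler in the main thread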
def getsize(name):
    runque = RedisQueueConnection(name).conn
    print runque.qsize()
    i = runque.get()
    runque.put(i)
    print i