def crawl(url, currentDepth, countUrls):
    redisCon = Redis(host=conf.REDIS_HOST, port=conf.REDIS_PORT, password=conf.REDIS_PASSWD)
    try:
        headers = dict()
        headers[HTTP_HEADER.USER_AGENT] = randomUserAgents()
        response = requests.get(url, timeout=10, headers=headers)
        # crawlMsg = 'crawled %s depth: %d count: %d' % (url, currentDepth, countUrls)
        # logger.log(CUSTOM_LOGGING.SYSINFO, crawlMsg)
        content = response.text
        kb.pageEncoding = response.encoding
        conf.cookie = str(response.cookies.get_dict())
        # mark the URL as visited in Redis and persist the crawl record
        hashData = hashUrl(url)
        redisCon.sadd('visited', hashData)
        redisCon.lpush('visitedList', url)
        getDB().insert({'url': url, 'depth': currentDepth, 'count': countUrls})
    except Exception as ex:
        logger.log(CUSTOM_LOGGING.ERROR, ex)
        # traceback.print_exc()
        return
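# crawl() relies on a hashUrl() helper that is not shown in this section.
# A minimal sketch of what it is assumed to do (an MD5 hex digest of the URL,
# used as the member stored in the Redis 'visited' set):
import hashlib

def hashUrl(url):
    # assumption: URLs are deduplicated by the MD5 hex digest of the stripped string
    return hashlib.md5(url.strip()).hexdigest()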
def __init__(self):
    self.redisCon = Redis(host=conf.REDIS_HOST, port=conf.REDIS_PORT, password=conf.REDIS_PASSWD)
    self.jobQueue = Queue(connection=self.redisCon)
    # drop stale crawler keys ('visit*' lists/sets and 'rq:*' job keys) left over from a previous run
    map(lambda key: self.redisCon.delete(key),
        [key for key in self.redisCon.keys() if re.search('visit|rq:', key, re.I)])
    # seed the crawl frontier with the target site
    hashData = hashUrl(conf.CRAWL_SITE)
    self.redisCon.lpush('visit', conf.CRAWL_SITE)
    self.redisCon.sadd('visitSet', hashData)
def crawl(target):
    visited = set()
    visitSet = set()
    visitQueue = Queue()
    visitQueue.put(target)
    hashData = hashUrl(target)
    visitSet.add(hashData)
    fp = codecs.open(paths.PENEWORK_ROOT_PATH + '/' + conf.STORE_FILENAME, 'w', 'utf-8')
    lock = Lock()
    headers = dict()
    currentDepth = 0

    def crawlerThread():
        # countVisitedUrls is a module-level counter shared by all worker threads
        global countVisitedUrls
        while visitQueue.qsize() > 0:
            url = visitQueue.get()
            try:
                hashData = hashUrl(url)
                if hashData not in visited:
                    headers[HTTP_HEADER.USER_AGENT] = randomUserAgents()
                    response = requests.get(url, timeout=10, headers=headers)
                    crawlMsg = 'crawled %s depth: %d count: %d' % (url, currentDepth, countVisitedUrls)
                    logger.log(CUSTOM_LOGGING.SYSINFO, crawlMsg)
                    content = response.text
                    kb.pageEncoding = response.encoding
                    conf.cookie = str(response.cookies.get_dict())
                    try:
                        lock.acquire()
                        visited.add(hashData)
                        countVisitedUrls += 1
                        fp.write(url + '\n')
                        lock.release()
                    except Exception as ex:
                        logger.log(CUSTOM_LOGGING.ERROR, ex)
                        if lock.locked():
                            lock.release()
                        continue
                else:
                    continue
            except Exception as ex:
                logger.log(CUSTOM_LOGGING.ERROR, ex)
                traceback.print_exc()
                continue
def start(self):
    initDB()
    countDepth = 0
    countUrls = 0
    while countDepth <= int(conf.CRAWL_DEPTH):
        # drain the 'visit' list and enqueue a crawl job for every URL at this depth
        while True:
            # print 'len visit:', self.redisCon.llen('visit')
            # print 'len visited:', self.redisCon.scard('visited')
            url = self.redisCon.lpop('visit')
            if url:
                countUrls += 1
                print 'countDepth:', countDepth, 'countUrls:', countUrls
                self.jobQueue.enqueue_call(crawl, args=(url, countDepth, countUrls))
            else:
                self.redisCon.delete('visitSet')
                break
        # collect newly discovered URLs pushed to 'tmpVisit' by the workers;
        # a 30 second blpop timeout means the job queue is empty (except failed jobs)
        while True:
            keyUrl = self.redisCon.blpop('tmpVisit', timeout=30)
            if keyUrl:
                url = keyUrl[1]
                hashData = hashUrl(url)
                if not self.redisCon.sismember('visited', hashData) and \
                        not self.redisCon.sismember('visitSet', hashData):
                    self.redisCon.lpush('visit', url)
                    self.redisCon.sadd('visitSet', hashData)
            else:
                break
        countDepth += 1
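# start() only enqueues jobs; the crawl() jobs themselves run in RQ worker
# processes connected to the same Redis instance. A minimal sketch of launching
# such a worker (standard rq usage; the conf.* names come from the code above,
# and importing them this way is an assumption about how the project is laid out):
from redis import Redis
from rq import Connection, Queue, Worker

if __name__ == '__main__':
    redisCon = Redis(host=conf.REDIS_HOST, port=conf.REDIS_PORT, password=conf.REDIS_PASSWD)
    with Connection(redisCon):
        # listen on the default queue, which is where jobQueue.enqueue_call() puts jobs
        Worker([Queue()]).work()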
        # inside crawlerThread: 'forms' is expected to hold the forms parsed out
        # of the fetched page (the parsing code is not shown in this excerpt)
        for form in forms:
            formMsg = '%s has form, url: %s method: %s data: %s' % (url, form[0], form[1], form[2])
            logger.log(CUSTOM_LOGGING.WARNING, formMsg)
            try:
                lock.acquire()
                fp.write(formMsg + '\n')
                lock.release()
            except Exception as ex:
                logger.log(CUSTOM_LOGGING.ERROR, ex)
                if lock.locked():
                    lock.release()

    # breadth-first driver loop of crawl(target): crawl one depth level at a time
    while conf.CRAWL_DEPTH >= currentDepth:
        links = Queue()
        # runThreads(conf.numThreads, crawlerThread)
        crawlerThread()
        visitSet = set()
        while links.qsize() > 0:
            tmpUrl = links.get()
            hashData = hashUrl(tmpUrl)
            if hashData not in visited and hashData not in visitSet:
                visitQueue.put(tmpUrl)
                visitSet.add(hashData)
        currentDepth += 1
    fp.close()
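# The commented-out runThreads(conf.numThreads, crawlerThread) call above points
# at a small thread-pool helper that is not shown in this section; the following
# is only an assumed sketch of its shape:
import threading

def runThreads(numThreads, threadFunction):
    # start numThreads workers running the same function and wait for them all to finish
    threads = [threading.Thread(target=threadFunction) for _ in range(numThreads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()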