@gen.coroutine
def worker():
    """Typical consumer.

    Get a spider.model.SpiderResult from spider.queue.spider_queue and process it.
    """
    spider_log.debug("queue_size: {0}, hash_size: {1}, fetching: {2}".format(
        spider_queue.qsize(), len(hash_list), fetching))
    if len(hash_list) >= settings.LIMIT or (spider_queue.qsize() == 0 and fetching == 0):
        from spider.model import spider_session
        spider_log.info("limit touched!")
        spider_log.info("stopping spider now.")
        spider_session.close()
        ioloop.IOLoop.current().stop()
    record = yield spider_queue.get()
    url = record.url
    depth = record.depth
    try:
        if url_hash(url) in hash_list:
            # just in case...
            return
        # spider_log.debug("queue_size: {0}, hash_size: {1}, fetching: {2}".format(
        #     spider_queue.qsize(), len(hash_list), fetching))
        links = yield get_links_from_url(record)
        for link in links:
            if url_hash(link) in hash_list or record.depth >= settings.DEPTH:
                continue
            new_record = SpiderResult(url=urlparse.urljoin(url, link), depth=depth + 1)
            new_record.refer = url
            spider_queue.put(new_record)
    finally:
        spider_queue.task_done()
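# The consumer above only relies on a small interface of spider.model.SpiderResult:
# it is constructed with url/depth, and refer/content are filled in later. The real
# model is persisted via insert_record/spider_session, so it is presumably an ORM
# class; the minimal stand-in below is only an illustration of that assumed interface.
class SpiderResult(object):
    """Illustrative stand-in for spider.model.SpiderResult (not the real model)."""

    def __init__(self, url, depth):
        self.url = url          # absolute URL to fetch
        self.depth = depth      # crawl depth, compared against settings.DEPTH
        self.refer = None       # URL of the page that linked here
        self.content = None     # zlib-compressed page body, set after fetching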
def __init__(self, settings):
    init_log(settings)
    init_queue()
    init_db()
    spider_log.debug("delay: %s" % settings.DELAY)
    spider_log.debug("limit: %s" % settings.LIMIT)
    self.settings = settings
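# The constructor above only logs a couple of values, but the code in this module
# reads several attributes from the settings object. A minimal sketch of a compatible
# settings module follows; the attribute names come from this file, while the sample
# values (and anything else, such as a start URL) are assumptions.
# settings.py (illustrative)
DELAY = 1                            # delay between requests, logged in __init__
LIMIT = 100                          # stop after this many pages are stored (worker)
DEPTH = 3                            # maximum crawl depth (worker)
HEADER = {"User-Agent": "spider"}    # optional request headers (get_links_from_url)
KEYWORD = ""                         # optional keyword a page must contain to be stored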
@gen.coroutine
def get_links_from_url(spider_result):
    """Get url links in page spider_result.url

    :param spider_result: spider.model.SpiderResult
    :return: list, a list of formatted urls
    """
    global fetching
    url = spider_result.url
    client = httpclient.AsyncHTTPClient()
    # parse HEADER in settings
    try:
        header = settings.HEADER
    except AttributeError:
        header = {}
    request = httpclient.HTTPRequest(url=url, headers=header, follow_redirects=True,
                                     max_redirects=5, request_timeout=30)
    # parse KEYWORD in settings
    try:
        keyword = settings.KEYWORD
        if not isinstance(keyword, str):
            keyword = ""
    except AttributeError:
        keyword = ""
    try:
        fetching += 1
        response = yield client.fetch(request)
        fetching -= 1
        spider_log.debug(u"{0} fetching {1}".format(response.request_time, url))
        # decode the body to text if it came back as bytes
        html = response.body if isinstance(response.body, str) else response.body.decode()
        # get links
        doc = fromstring(html)
        links = filter(None, map(url_formatter, doc.xpath("//a/@href")))
        if keyword and html.find(keyword) == -1:
            # a keyword is set in settings but the page does not contain it,
            # so do nothing (the page is not stored)
            pass
        else:
            # compress the response body before storing it
            spider_result.content = zlib.compress(html)
            try:
                insert_record(spider_result)
                hash_list.append(url_hash(url))
            except DataError:
                spider_log.warn("insert fail: {0}".format(url))
        raise gen.Return(list(set(links)))
    except httpclient.HTTPError:
        fetching -= 1
        spider_log.debug(u"failed fetching {0}".format(url))
        raise gen.Return([])
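# A minimal sketch (not from the original source) of how the coroutines above can be
# wired together: seed the queue, spawn a few concurrent workers, and start the IOLoop.
# worker() stops the loop itself once settings.LIMIT is reached or the queue drains.
# The start_url argument and the concurrency count are assumptions.
def run(start_url, concurrency=10):
    # seed the queue with the first record at depth 0
    spider_queue.put(SpiderResult(url=start_url, depth=0))

    @gen.coroutine
    def consume():
        # keep pulling records until the IOLoop is stopped by worker()
        while True:
            yield worker()

    for _ in range(concurrency):
        ioloop.IOLoop.current().spawn_callback(consume)
    ioloop.IOLoop.current().start()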