def worker():
    """Typical Consumer

    Get spider.model.SpiderResult from spider.queue.spider_queue
    and process the spider_result
    """
    spider_log.debug("queue_size: {0}, hash_size: {1}, fetching: {2}".format(
        spider_queue.qsize(), len(hash_list), fetching))
    # stop the whole crawl once the page limit is reached
    # or there is nothing queued and nothing still being fetched
    if len(hash_list) >= settings.LIMIT or (spider_queue.qsize() == 0 and fetching == 0):
        from spider.model import spider_session
        spider_log.info("limit touched!")
        spider_log.info("stopping spider now.")
        spider_session.close()
        ioloop.IOLoop.current().stop()
    record = yield spider_queue.get()
    url = record.url
    depth = record.depth
    try:
        if url_hash(url) in hash_list:
            # just in case...
            return
        # spider_log.debug("queue_size: {0}, hash_size: {1}, fetching: {2}".format(spider_queue.qsize(), len(hash_list), fetching))
        links = yield get_links_from_url(record)
        for link in links:
            # skip links that were already visited or that exceed the configured depth
            if url_hash(link) in hash_list or record.depth >= settings.DEPTH:
                continue
            new_record = SpiderResult(url=urlparse.urljoin(url, link), depth=depth + 1)
            new_record.refer = url
            spider_queue.put(new_record)
    finally:
        # always mark the item done so spider_queue.join() can complete
        spider_queue.task_done()
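
# A minimal, self-contained sketch of the same produce/consume pattern built on
# tornado.queues.Queue, kept apart from the spider module. The names demo_queue,
# demo_consumer and demo_main are illustrative only and are not part of the project.
def _consumer_pattern_sketch():
    from tornado import gen, ioloop, queues

    demo_queue = queues.Queue()

    @gen.coroutine
    def demo_consumer():
        while True:
            item = yield demo_queue.get()  # wait for the next work item
            try:
                print("processing {0}".format(item))
            finally:
                demo_queue.task_done()  # lets demo_queue.join() complete

    @gen.coroutine
    def demo_main():
        for i in range(10):
            demo_queue.put_nowait(i)  # enqueue some work items
        for _ in range(4):  # start a few concurrent consumers
            ioloop.IOLoop.current().spawn_callback(demo_consumer)
        yield demo_queue.join()  # block until every item has been processed

    ioloop.IOLoop.current().run_sync(demo_main)
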
def init_db():
    """Initialize Database

    :return: None
    """
    global init_db_time
    init_db_time = datetime.datetime.now()
    cursor = spider_session.cursor()
    sqls = [
        "CREATE TABLE IF NOT EXISTS tbl_spider_result("
        "id INTEGER PRIMARY KEY AUTO_INCREMENT,"
        "url VARCHAR(1000) NOT NULL,"
        "depth INTEGER,"
        "create_time TIMESTAMP,"
        "refer VARCHAR(1000),"
        "content BLOB,"
        "spider_run_time DATETIME,"
        "keyword VARCHAR(100))"
    ]
    # execute every DDL statement with a plain loop; map() would stay eager only
    # on Python 2, while on Python 3 it returns a lazy iterator and never runs
    for sql in sqls:
        cursor.execute(sql)
    spider_session.commit()
    spider_log.info("database initialized")
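
# A standalone sketch of the same table initialization against an in-memory
# sqlite3 database, for illustration only; the real spider_session may well be a
# MySQL connection, and SQLite spells the clause AUTOINCREMENT rather than
# AUTO_INCREMENT.
def _init_db_sketch():
    import datetime
    import sqlite3

    conn = sqlite3.connect(":memory:")
    cursor = conn.cursor()
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS tbl_spider_result("
        "id INTEGER PRIMARY KEY AUTOINCREMENT,"
        "url VARCHAR(1000) NOT NULL,"
        "depth INTEGER,"
        "create_time TIMESTAMP,"
        "refer VARCHAR(1000),"
        "content BLOB,"
        "spider_run_time DATETIME,"
        "keyword VARCHAR(100))"
    )
    # insert and read back one row to confirm the schema works as expected
    cursor.execute(
        "INSERT INTO tbl_spider_result(url, depth, create_time) VALUES (?, ?, ?)",
        ("http://example.com", 0, datetime.datetime.now()),
    )
    conn.commit()
    print(cursor.execute("SELECT id, url, depth FROM tbl_spider_result").fetchall())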