Example #1
# Imports reconstructed from the docstring and call sites; spider_log,
# hash_list, fetching, settings, url_hash and get_links_from_url are
# assumed to be module-level names defined elsewhere in the project.
import urlparse

from tornado import gen, ioloop

from spider.model import SpiderResult
from spider.queue import spider_queue


@gen.coroutine
def worker():
    """Typical Consumer
    Get spider.model.SpiderResult from spider.queue.spider_queue
    and process the spider_result
    """
    spider_log.debug("queue_size: {0}, hash_size: {1}, fetching: {2}".format(spider_queue.qsize(), len(hash_list), fetching))
    # Stop crawling once the page limit is reached or the queue has drained
    # with no fetches still in flight.
    if len(hash_list) >= settings.LIMIT or (spider_queue.qsize() == 0 and fetching == 0):
        from spider.model import spider_session
        spider_log.info("limit touched!")
        spider_log.info("stopping spider now.")
        spider_session.close()
        ioloop.IOLoop.current().stop()
        return  # do not fall through to another queue.get() after stopping
    # Block until a result record is available on the queue.
    record = yield spider_queue.get()
    url = record.url
    depth = record.depth
    try:
        if url_hash(url) in hash_list:
            # just in case...
            return
        # Fetch the page and collect its outgoing links.
        links = yield get_links_from_url(record)
        for link in links:
            # Skip already-visited links and anything past the depth limit.
            if url_hash(link) in hash_list or record.depth >= settings.DEPTH:
                continue
            new_record = SpiderResult(url=urlparse.urljoin(url, link), depth=depth + 1)
            new_record.refer = url
            spider_queue.put(new_record)
    finally:
        spider_queue.task_done()
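
Since worker() handles one queue item per call, a driver typically wraps it
in a loop and spawns several concurrent consumers on the IOLoop. Below is a
minimal sketch under that assumption; run_worker, main, and concurrency are
illustrative names, not part of the original module:

from tornado import gen, ioloop


@gen.coroutine
def run_worker():
    # Keep consuming until worker() stops the IOLoop via its limit check.
    while True:
        yield worker()


def main():
    concurrency = 10  # assumed tunable, not taken from the original code
    for _ in range(concurrency):
        ioloop.IOLoop.current().spawn_callback(run_worker)
    ioloop.IOLoop.current().start()

Spawning several run_worker coroutines lets fetches overlap on a single
thread, which is the usual reason to pair tornado.queues with coroutines.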
Example #2
# datetime is required below; spider_session is the module-level DB
# connection (imported from spider.model, as in Example #1), and spider_log
# is assumed to be defined elsewhere in the project.
import datetime

from spider.model import spider_session


def init_db():
    """Initialize the database.

    Creates tbl_spider_result if it does not already exist and records the
    initialization time in the module-level init_db_time.

    :return: None
    """
    global init_db_time
    init_db_time = datetime.datetime.now()
    cursor = spider_session.cursor()
    # MySQL-style DDL (AUTO_INCREMENT); adjust the dialect for other backends.
    sqls = [
        "CREATE TABLE IF NOT EXISTS tbl_spider_result("
        "id INTEGER PRIMARY KEY AUTO_INCREMENT,"
        "url VARCHAR(1000) NOT NULL,"
        "depth INTEGER,"
        "create_time TIMESTAMP,"
        "refer VARCHAR(1000),"
        "content BLOB,"
        "spider_run_time DATETIME,"
        "keyword VARCHAR(100))"
    ]
    # map() is lazy on Python 3 and would never execute the statements,
    # so iterate explicitly.
    for sql in sqls:
        cursor.execute(sql)
    spider_session.commit()
    spider_log.info("database initialized")