Пример #1
0
 def initWebsite_delay_dict(self, record):
     """
     @summary: 初始化网站的等待更新时间
     :param record: 网站记录(id, url, xpath, delay_time)
     :return:
     """
     if not Cache.keyExist(cache.websiteDelay_dict, record[0]):
         Cache.setDict(cache.websiteDelay_dict, record[0], record[-1])
Пример #2
0
 def run(self):
     while not global_EXIT:
         url = ""
         try:
             website_id, url = Cache.getQueue(cache.freshContentUrl_queue, False)
             res = filterContentInfoFunc(website_id, url)
             if res == SpiderResType.success or res == SpiderResType.alreadyExist:
                 Cache.appendList(cache.oldContent_list, url)
             else:
                 Cache.setDict(cache.unrecognized_contentUrl_dict, url, website_id)
         except Exception as e:
             if type(e) is not queue.Empty:
                 log.logMsg(LogType.error, "[FilterContentInfoThread] %s %s" % (url, traceback.format_exc()))
Пример #3
0
 def run(self):
     while not global_EXIT:
         website_url = ""
         try:
             website_id, website_url, xpath = Cache.getQueue(cache.websiteUrl_queue, False)
             if not filterContentUrlFunc(website_id, website_url, xpath):
                 Cache.setDict(cache.unrecognized_websiteUrl_dict, website_id, (website_url, xpath))
         except Exception as e:
             if type(e) is not queue.Empty:
                 log.logMsg(LogType.error, "[FilterContentUrlThread.freshHandler] %s %s"%(website_url, traceback.format_exc()))
             else:
                 for i in range(10):
                     if global_EXIT: break
                     time.sleep(1)
Пример #4
0
def resetDelay_time():
    """
    @summary: 重置各网站的爬取延迟
    """
    db = None
    try:
        db = mysql.Mysql()
        for website_id in Cache.keys(cache.websiteDelay_dict):
            record = Cache.getDict(cache.websiteDelay_dict, website_id)
            Cache.setDict(cache.websiteDelay_dict, website_id, (record[0], 0))
            db.saveDelay_time(website_id, 0)
    except Exception as e:
        log.logMsg(LogType.error, "[saveWebsiteDelaytime] %s" % (repr(e)))
    finally:
        if db: db.dispose()
Пример #5
0
def main():
    thread_count = 3
    pre_threads = []

    initdb()                                            # 初始化redis数据库
    initGlobalArgs()
    initContentUrl_dict()                               # 初始化去重表

    log_thread = log.LogThread()                        # 启动日志记录线程
    log_thread.start()

    QueryWebsiteUrl_thread = QueryWebsiteUrlThread()    # 启动读取网站地址线程
    QueryWebsiteUrl_thread.start()
    pre_threads.append(QueryWebsiteUrl_thread)

    filterContentUrl_thread = FilterContentUrlThread()  # 启动爬取内容地址线程
    filterContentUrl_thread.start()
    pre_threads.append(filterContentUrl_thread)

    for i in range(thread_count):
        thread = FilterContentInfoThread()
        thread.start()
        pre_threads.append(thread)

    unrecognizedWebsiteUrl_thread = UnrecognizedWebsiteUrl_Thread()
    unrecognizedWebsiteUrl_thread.start()
    pre_threads.append(unrecognizedWebsiteUrl_thread)

    unrecognizedContentUrl_thread = UnrecognizedContentUrl_Thread()
    unrecognizedContentUrl_thread.start()
    pre_threads.append(unrecognizedContentUrl_thread)


    while not global_EXIT: pass

    time.sleep(5)

    saveWebsiteDelaytime()              # 保存各网站的延迟时间

    for t in pre_threads:
        t.join()

    log.logMsg(LogType.success, "--------------------bye---------------------\n")
    while not Cache.qempty(cache.log_queue): pass  # 等待把所有日志写到文件中
    Cache.setDict(cache.globalArgs_dict, "LogThread_EXIT", True)
    log_thread.join()

    if db: db.dispose()
Пример #6
0
def initGlobalArgs():
    """
    @summary:  初始化全局变量
    """
    Cache.setDict(cache.globalArgs_dict, "LogThread_EXIT", False)
    Cache.setDict(cache.globalArgs_dict, "global_EXIT", False)
Пример #7
0
        log.logMsg(LogType.error, "[saveWebsiteDelaytime] %s" % (repr(e)))
    finally:
        if db: db.dispose()


def command(cmd):
    cmd = cmd.lower()
    if cmd == "delay-time" or cmd == "dt":
        show_delay_time()
    elif cmd == "content-count" or cmd == "cc":
        content_count()
    elif cmd == "reset-delay-time" or cmd == "rdt":
        resetDelay_time()


if __name__ == '__main__':
    print("* Started WeNeW_Spider-programe...")
    thread = threading.Thread(target=main)
    thread.setDaemon(True)
    thread.start()
    while True:
        cmd = input(">>")
        if cmd.lower() == "stop":
            global_EXIT = True
            Cache.setDict(cache.globalArgs_dict, "global_EXIT", True)
            print("* Waiting for the WeNeW_Spider-programe to end...")
            thread.join()
            print("* WeNeW_Spider-programe closed successfully!")
            break
        else:
            command(cmd)
Пример #8
0
def incrDelay_time(website_id, timeout):
    """
    @summary: 对网站增加timeout个时间延迟
    """
    record = Cache.getDict(cache.websiteDelay_dict, website_id)
    Cache.setDict(cache.websiteDelay_dict, website_id, int(record) + timeout)