def initWebsite_delay_dict(self, record):
    """
    @summary: Initialize the pending-update delay for a website.
    :param record: website record (id, url, xpath, delay_time)
    :return:
    """
    if not Cache.keyExist(cache.websiteDelay_dict, record[0]):
        Cache.setDict(cache.websiteDelay_dict, record[0], record[-1])
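# Usage sketch (illustrative, not part of the original module): the record is
# assumed to arrive in the column order the docstring describes, i.e.
# (id, url, xpath, delay_time); the values below are hypothetical.
#
#   self.initWebsite_delay_dict((42, "http://example.com/news", "//a/@href", 0))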
def run(self): while not global_EXIT: url = "" try: website_id, url = Cache.getQueue(cache.freshContentUrl_queue, False) res = filterContentInfoFunc(website_id, url) if res == SpiderResType.success or res == SpiderResType.alreadyExist: Cache.appendList(cache.oldContent_list, url) else: Cache.setDict(cache.unrecognized_contentUrl_dict, url, website_id) except Exception as e: if type(e) is not queue.Empty: log.logMsg(LogType.error, "[FilterContentInfoThread] %s %s" % (url, traceback.format_exc()))
def run(self): while not global_EXIT: website_url = "" try: website_id, website_url, xpath = Cache.getQueue(cache.websiteUrl_queue, False) if not filterContentUrlFunc(website_id, website_url, xpath): Cache.setDict(cache.unrecognized_websiteUrl_dict, website_id, (website_url, xpath)) except Exception as e: if type(e) is not queue.Empty: log.logMsg(LogType.error, "[FilterContentUrlThread.freshHandler] %s %s"%(website_url, traceback.format_exc())) else: for i in range(10): if global_EXIT: break time.sleep(1)
def resetDelay_time():
    """
    @summary: Reset the crawl delay of every website to zero.
    """
    db = None
    try:
        db = mysql.Mysql()
        for website_id in Cache.keys(cache.websiteDelay_dict):
            # Keep the cached value an int, consistent with initWebsite_delay_dict
            # and incrDelay_time, and persist the reset to MySQL.
            Cache.setDict(cache.websiteDelay_dict, website_id, 0)
            db.saveDelay_time(website_id, 0)
    except Exception as e:
        log.logMsg(LogType.error, "[resetDelay_time] %s" % (repr(e)))
    finally:
        if db:
            db.dispose()
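# show_delay_time() and content_count() are referenced by command() below but are
# defined elsewhere in the project. A minimal sketch of show_delay_time, assuming
# the cache simply maps website_id -> delay in seconds, could look like:
#
#   def show_delay_time():
#       for website_id in Cache.keys(cache.websiteDelay_dict):
#           print("%s: %ss" % (website_id, Cache.getDict(cache.websiteDelay_dict, website_id)))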
def main():
    thread_count = 3
    pre_threads = []
    initdb()                                  # initialize the Redis database
    initGlobalArgs()
    initContentUrl_dict()                     # initialize the deduplication table

    log_thread = log.LogThread()              # start the logging thread
    log_thread.start()

    QueryWebsiteUrl_thread = QueryWebsiteUrlThread()      # thread that reads website addresses
    QueryWebsiteUrl_thread.start()
    pre_threads.append(QueryWebsiteUrl_thread)

    filterContentUrl_thread = FilterContentUrlThread()    # thread that collects content URLs
    filterContentUrl_thread.start()
    pre_threads.append(filterContentUrl_thread)

    for i in range(thread_count):
        thread = FilterContentInfoThread()
        thread.start()
        pre_threads.append(thread)

    unrecognizedWebsiteUrl_thread = UnrecognizedWebsiteUrl_Thread()
    unrecognizedWebsiteUrl_thread.start()
    pre_threads.append(unrecognizedWebsiteUrl_thread)

    unrecognizedContentUrl_thread = UnrecognizedContentUrl_Thread()
    unrecognizedContentUrl_thread.start()
    pre_threads.append(unrecognizedContentUrl_thread)

    while not global_EXIT:                    # wait until the console requests a stop
        time.sleep(1)
    time.sleep(5)

    saveWebsiteDelaytime()                    # persist each website's delay time
    for t in pre_threads:
        t.join()
    log.logMsg(LogType.success, "--------------------bye---------------------\n")

    while not Cache.qempty(cache.log_queue):  # wait until all log entries are written to file
        pass
    Cache.setDict(cache.globalArgs_dict, "LogThread_EXIT", True)
    log_thread.join()
    if db:
        db.dispose()
def initGlobalArgs():
    """
    @summary: Initialize the global flags.
    """
    Cache.setDict(cache.globalArgs_dict, "LogThread_EXIT", False)
    Cache.setDict(cache.globalArgs_dict, "global_EXIT", False)
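# Shutdown order (see main() and the __main__ loop below): the console "stop"
# command sets global_EXIT, the worker threads drain out and are joined, the
# per-site delays are saved, and only after cache.log_queue is empty is
# "LogThread_EXIT" set so the logging thread can exit cleanly.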
log.logMsg(LogType.error, "[saveWebsiteDelaytime] %s" % (repr(e))) finally: if db: db.dispose() def command(cmd): cmd = cmd.lower() if cmd == "delay-time" or cmd == "dt": show_delay_time() elif cmd == "content-count" or cmd == "cc": content_count() elif cmd == "reset-delay-time" or cmd == "rdt": resetDelay_time() if __name__ == '__main__': print("* Started WeNeW_Spider-programe...") thread = threading.Thread(target=main) thread.setDaemon(True) thread.start() while True: cmd = input(">>") if cmd.lower() == "stop": global_EXIT = True Cache.setDict(cache.globalArgs_dict, "global_EXIT", True) print("* Waiting for the WeNeW_Spider-programe to end...") thread.join() print("* WeNeW_Spider-programe closed successfully!") break else: command(cmd)
def incrDelay_time(website_id, timeout):
    """
    @summary: Increase the website's crawl delay by timeout.
    """
    record = Cache.getDict(cache.websiteDelay_dict, website_id)
    Cache.setDict(cache.websiteDelay_dict, website_id, int(record) + timeout)
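# Usage sketch (illustrative, not part of the original module): a fetch routine
# could back off a misbehaving site by increasing its cached delay. The fetchPage
# name, the requests call and the 300-second step are assumptions; the delay unit
# is assumed to be seconds.
#
#   def fetchPage(website_id, url):
#       try:
#           return requests.get(url, timeout=10)
#       except requests.Timeout:
#           incrDelay_time(website_id, 300)   # postpone the next visit to this site
#           return None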