def main():
    thread_count = 3
    pre_threads = []
    initdb()                 # initialize the Redis database
    initGlobalArgs()
    initContentUrl_dict()    # initialize the deduplication table

    log_thread = log.LogThread()                      # start the logging thread
    log_thread.start()

    QueryWebsiteUrl_thread = QueryWebsiteUrlThread()  # start the thread that reads website URLs
    QueryWebsiteUrl_thread.start()
    pre_threads.append(QueryWebsiteUrl_thread)

    filterContentUrl_thread = FilterContentUrlThread()  # start the thread that crawls content URLs
    filterContentUrl_thread.start()
    pre_threads.append(filterContentUrl_thread)

    for _ in range(thread_count):
        thread = FilterContentInfoThread()
        thread.start()
        pre_threads.append(thread)

    unrecognizedWebsiteUrl_thread = UnrecognizedWebsiteUrl_Thread()
    unrecognizedWebsiteUrl_thread.start()
    pre_threads.append(unrecognizedWebsiteUrl_thread)

    unrecognizedContentUrl_thread = UnrecognizedContentUrl_Thread()
    unrecognizedContentUrl_thread.start()
    pre_threads.append(unrecognizedContentUrl_thread)

    while not global_EXIT:   # sleep instead of busy-spinning while waiting for shutdown
        time.sleep(1)
    time.sleep(5)

    saveWebsiteDelaytime()   # persist each website's delay time
    for t in pre_threads:
        t.join()
    log.logMsg(LogType.success, "--------------------bye---------------------\n")

    while not Cache.qempty(cache.log_queue):  # wait until every log entry is flushed to file
        time.sleep(0.1)
    Cache.setDict(cache.globalArgs_dict, "LogThread_EXIT", True)
    log_thread.join()
    if db:
        db.dispose()
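Nothing in this listing ever sets global_EXIT itself, so it is worth sketching how the wait loop in main() gets released. A minimal sketch, assuming global_EXIT is a module-level boolean flag; the SIGINT wiring below is illustrative, not part of the original project:

import signal
import time

global_EXIT = False  # assumed module-level shutdown flag, matching the listing above

def _handle_sigint(signum, frame):
    # Flip the shared flag; every polling loop checks it and drains out.
    global global_EXIT
    global_EXIT = True

signal.signal(signal.SIGINT, _handle_sigint)

if __name__ == "__main__":
    # Stand-in for main()'s shutdown wait: sleep briefly instead of spinning.
    while not global_EXIT:
        time.sleep(0.5)
    print("bye")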
    def run(self):
        while not global_EXIT:
            try:
                if Cache.qempty(cache.websiteUrl_queue):
                    records = mysql.Mysql.queryWebsiteUrl()
                    for record in records:
                        # record layout: id, url, xpath, detail, delay_time
                        record = [str(item) for item in record]
                        self.initWebsite_delay_dict(record)
                        t = threading.Thread(target=self.putRecord, args=(record,))
                        t.daemon = True  # setDaemon() is deprecated; assign the attribute instead
                        t.start()
            except Exception:
                log.logMsg(LogType.error, "[QueryWebsiteUrlThread] %s" % traceback.format_exc())
            # Poll the database again after 60 seconds, waking once per second
            # so the thread can exit promptly once global_EXIT is set.
            for _ in range(60):
                if global_EXIT:
                    break
                time.sleep(1)
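putRecord() is not shown in this listing. A minimal sketch of what it plausibly does, assuming it sleeps for the per-site delay and then enqueues the record onto cache.websiteUrl_queue; the queue name and delay handling below are assumptions, not confirmed by the source:

import queue
import time

websiteUrl_queue = queue.Queue()  # stand-in for cache.websiteUrl_queue

def putRecord(record):
    # record layout, per the comment in run(): [id, url, xpath, detail, delay_time],
    # with every field already converted to a string.
    delay = float(record[4]) if record[4] else 0.0
    time.sleep(delay)             # honor the site-specific politeness delay
    websiteUrl_queue.put(record)  # downstream crawler threads consume from this queue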