def run(self):
    """Worker loop: filter fresh content URLs until global shutdown.

    Pops ``(website_id, url)`` pairs off ``cache.freshContentUrl_queue``
    without blocking. URLs that filter successfully (or were already seen)
    are appended to ``cache.oldContent_list``; anything else is parked in
    ``cache.unrecognized_contentUrl_dict`` keyed by url for later review.

    Runs until the module-level ``global_EXIT`` flag becomes truthy.
    """
    while not global_EXIT:
        url = ""
        try:
            website_id, url = Cache.getQueue(cache.freshContentUrl_queue, False)
            res = filterContentInfoFunc(website_id, url)
            # Membership test instead of chained == comparisons.
            if res in (SpiderResType.success, SpiderResType.alreadyExist):
                Cache.appendList(cache.oldContent_list, url)
            else:
                Cache.setDict(cache.unrecognized_contentUrl_dict, url, website_id)
        except queue.Empty:
            # Non-blocking get on an empty queue is normal — just poll again.
            pass
        except Exception:
            log.logMsg(LogType.error, "[FilterContentInfoThread] %s %s" % (url, traceback.format_exc()))
def run(self):
    """Worker loop: filter website (list-page) URLs until global shutdown.

    Pops ``(website_id, website_url, xpath)`` triples off
    ``cache.websiteUrl_queue`` without blocking. Triples that fail
    ``filterContentUrlFunc`` are stored in
    ``cache.unrecognized_websiteUrl_dict`` keyed by website_id.

    When the queue is empty the thread backs off for up to 10 seconds,
    sleeping in 1-second steps so it still notices ``global_EXIT`` promptly.
    """
    while not global_EXIT:
        website_url = ""
        try:
            website_id, website_url, xpath = Cache.getQueue(cache.websiteUrl_queue, False)
            if not filterContentUrlFunc(website_id, website_url, xpath):
                Cache.setDict(cache.unrecognized_websiteUrl_dict, website_id, (website_url, xpath))
        except queue.Empty:
            # Queue drained: back off ~10s, staying responsive to shutdown.
            for _ in range(10):
                if global_EXIT:
                    break
                time.sleep(1)
        except Exception:
            log.logMsg(LogType.error, "[FilterContentUrlThread.freshHandler] %s %s" % (website_url, traceback.format_exc()))
def run(self):
    """Logger loop: drain ``cache.log_queue`` into a size-rotated log file.

    Pops one message at a time (non-blocking) and appends it, stamped with
    the current time, to the file named by ``self.getFilename()``. When the
    current file exceeds 1 MiB, ``self.index`` is bumped so the next
    ``getFilename()`` call yields a fresh file.

    Runs until the "LogThread_EXIT" flag in ``cache.globalArgs_dict``
    becomes truthy.
    """
    while not Cache.getDict(cache.globalArgs_dict, "LogThread_EXIT"):
        try:
            info = Cache.getQueue(cache.log_queue, False)
            filename = self.getFilename()
            if os.path.exists(filename):
                # Start a new log file once the current one exceeds 1 MiB.
                log_size = os.path.getsize(filename) / 1024 / 1024
                if log_size > 1:
                    self.index += 1
            # Re-query the filename: rotation above may have changed it.
            with open(self.getFilename(), 'a') as f:
                info += '<%s>\n' % (datetime.datetime.now().strftime("%H:%M:%S"))
                f.write(info)
        except queue.Empty:
            # Nothing to log right now; keep polling.
            pass
        except Exception as e:
            # Last-resort fallback: the logger itself cannot log its own errors.
            print("Log Error: %s" % e)