def run(self):
    """Retry one randomly chosen unrecognized content URL per cycle.

    Loops until the module-level ``global_EXIT`` flag becomes truthy.
    Each cycle:

    1. Picks a random key (a URL) from
       ``cache.unrecognized_contentUrl_dict``.
    2. If a URL was returned, looks up the value stored under it (used
       as ``website_id``) and passes both to ``filterContentInfoFunc``.
    3. When the result is ``SpiderResType.success`` or
       ``SpiderResType.alreadyExist``, removes the URL from the
       unrecognized dict and appends it to ``cache.oldContent_list``.
    4. Waits up to 300 seconds, in one-second steps, checking
       ``global_EXIT`` before each step so shutdown is not delayed by
       the full wait.

    Any exception raised while handling a URL is logged together with
    the URL and the traceback; the loop then continues.
    """
    while not global_EXIT:
        # Reset so the error message never reports a URL from a previous cycle.
        url = ""
        try:
            url = Cache.randomKey(cache.unrecognized_contentUrl_dict)
            if url:
                website_id = Cache.getDict(cache.unrecognized_contentUrl_dict, url)
                res = filterContentInfoFunc(website_id, url)
                if res in (SpiderResType.success, SpiderResType.alreadyExist):
                    Cache.removeDict(cache.unrecognized_contentUrl_dict, url)
                    Cache.appendList(cache.oldContent_list, url)
        except Exception:
            log.logMsg(LogType.error, "[FilterContentInfoThread.freshHandler] %s %s" % (url, traceback.format_exc()))

        # The wait runs after the try/except on purpose: if it lived inside
        # the try body, a failure that repeats every cycle would skip it and
        # turn this loop into a tight spin that floods the error log.
        for _ in range(300):
            if global_EXIT:
                break
            time.sleep(1)
def run(self):
    """Drain entries from the unrecognized-website dict until shutdown.

    Loops until the module-level ``global_EXIT`` flag becomes truthy.
    Each cycle:

    1. If ``cache.unrecognized_websiteUrl_dict`` is reported non-empty by
       ``Cache.dempty``, picks a random key (``website_id``).
    2. If a key was returned, reads its value, which must unpack into
       ``(website_url, xpath)``, and removes the entry from the dict.
       The next cycle then starts immediately.
    3. Otherwise (dict empty, no key returned, or an exception was
       logged) waits up to 30 seconds in one-second steps, checking
       ``global_EXIT`` before each step.

    Exceptions raised while handling an entry are logged together with
    the website URL (empty if it was not read yet) and the traceback.
    """
    while not global_EXIT:
        # Reset so the error message never reports a URL from a previous cycle.
        website_url = ""
        handled = False

        if not Cache.dempty(cache.unrecognized_websiteUrl_dict):
            try:
                website_id = Cache.randomKey(cache.unrecognized_websiteUrl_dict)
                if website_id:
                    # The unpack is kept even though xpath is unused below:
                    # a malformed entry still raises and gets logged.
                    website_url, _xpath = Cache.getDict(cache.unrecognized_websiteUrl_dict, website_id)
                    # NOTE(review): the original guarded this removal with
                    # `if (website_id, website_url, xpath):`, a non-empty
                    # tuple that is always true. The removal is therefore
                    # unconditional; presumably a predicate call on these
                    # three values was intended. Confirm with the owner.
                    Cache.removeDict(cache.unrecognized_websiteUrl_dict, website_id)
                    handled = True
            except Exception:
                log.logMsg(LogType.error, "[FilterContentUrlThread.unrecognizedHandler] %s %s" % (website_url, traceback.format_exc()))

        if handled:
            # Work was done; go straight to the next entry.
            continue

        # Nothing to do, or the attempt failed: back off instead of
        # re-polling immediately. Without this, an empty dict or a failure
        # that repeats every cycle makes the loop spin at full CPU.
        for _ in range(30):
            if global_EXIT:
                break
            time.sleep(1)