Exemplo n.º 1
0
def initContentUrl_dict():
    """
    @summary: Initialize the de-duplication list.

    Appends the first column of every row returned by
    ``mysql.Mysql.queryContentUrl()`` to ``cache.oldContent_list``
    (presumably the already-stored content URLs -- confirm against the query).
    """
    for row in mysql.Mysql.queryContentUrl():
        known_entry = row[0]
        Cache.appendList(cache.oldContent_list, known_entry)
Exemplo n.º 2
0
 def run(self):
     """
     @summary: Main loop: filter queued content URLs until global_EXIT is set.

     Each iteration takes one ``(website_id, url)`` pair from
     ``cache.freshContentUrl_queue`` and hands it to ``filterContentInfoFunc``.
     When the result is ``SpiderResType.success`` or
     ``SpiderResType.alreadyExist`` the URL is appended to
     ``cache.oldContent_list``; for any other result the URL is stored in
     ``cache.unrecognized_contentUrl_dict`` with ``website_id`` as its value.

     ``queue.Empty`` is treated as "nothing to do" and is not logged; every
     other exception is logged with its traceback and the loop continues.
     """
     while not global_EXIT:
         # Reset so the error log does not show a stale URL when the failure
         # happens before the queue item has been unpacked.
         url = ""
         try:
             # NOTE(review): the False argument is assumed to request a
             # non-blocking get that raises queue.Empty -- confirm in Cache.
             website_id, url = Cache.getQueue(cache.freshContentUrl_queue, False)
             res = filterContentInfoFunc(website_id, url)
             if res in (SpiderResType.success, SpiderResType.alreadyExist):
                 Cache.appendList(cache.oldContent_list, url)
             else:
                 Cache.setDict(cache.unrecognized_contentUrl_dict, url, website_id)
         except queue.Empty:
             # An empty queue is the normal idle case, not an error.
             continue
         except Exception:
             log.logMsg(LogType.error, "[FilterContentInfoThread] %s %s" % (url, traceback.format_exc()))
Exemplo n.º 3
0
 def putRecord(self, record):
     """
     @summary: Add record to the queue of waiting websites.

     ``record`` is a sequence whose first three items are
     ``(website_id, website_url, xpath)``. Nothing happens when ``website_id``
     is already in ``cache.workingWebsite_list`` or is a key of
     ``cache.unrecognized_websiteUrl_dict``.

     Otherwise the id is added to ``cache.workingWebsite_list``, the triple is
     put on ``cache.websiteUrl_queue``, and the call sleeps in one-second steps
     for the number of seconds stored under ``website_id`` in
     ``cache.websiteDelay_dict`` (stopping early once ``global_EXIT`` is set).
     The id is always removed from ``cache.workingWebsite_list`` before
     returning, including when an error is raised or the wait is interrupted.
     """
     website_id, website_url, xpath = record[:3]
     if Cache.listItemExist(cache.workingWebsite_list, website_id) or \
             Cache.keyExist(cache.unrecognized_websiteUrl_dict, website_id):
         return

     Cache.appendList(cache.workingWebsite_list, website_id)
     try:
         Cache.putQueue(cache.websiteUrl_queue, (website_id, website_url, xpath))
         sleep_time = Cache.getDict(cache.websiteDelay_dict, website_id)
         # Sleep one second at a time so a shutdown request is noticed quickly.
         for _ in range(int(sleep_time)):
             if global_EXIT:
                 return
             time.sleep(1)
     finally:
         # Always release the "working" marker; otherwise a failure above
         # (e.g. a missing or non-numeric delay) would leave the website
         # flagged as in progress and it would never be queued again.
         Cache.removeList(cache.workingWebsite_list, website_id)
Exemplo n.º 4
0
 def run(self):
     """
     @summary: Periodically retry one URL from the unrecognized-content dict.

     Until ``global_EXIT`` is set, each round picks a key from
     ``cache.unrecognized_contentUrl_dict`` via ``Cache.randomKey``. If a key
     is returned, its stored ``website_id`` is looked up and both are passed
     to ``filterContentInfoFunc``; on ``SpiderResType.success`` or
     ``SpiderResType.alreadyExist`` the URL is removed from the dict and
     appended to ``cache.oldContent_list``. The round then waits up to 300
     one-second sleeps, stopping early when ``global_EXIT`` is set.

     Any exception is logged with its traceback and the next round starts
     immediately (the wait is inside the guarded section).
     """
     while not global_EXIT:
         pending_url = ""
         try:
             pending_url = Cache.randomKey(cache.unrecognized_contentUrl_dict)
             if pending_url:
                 owner_id = Cache.getDict(cache.unrecognized_contentUrl_dict, pending_url)
                 outcome = filterContentInfoFunc(owner_id, pending_url)
                 if outcome in (SpiderResType.success, SpiderResType.alreadyExist):
                     Cache.removeDict(cache.unrecognized_contentUrl_dict, pending_url)
                     Cache.appendList(cache.oldContent_list, pending_url)
             # Wait in one-second slices so shutdown is not delayed by the pause.
             seconds_left = 300
             while seconds_left > 0 and not global_EXIT:
                 time.sleep(1)
                 seconds_left -= 1
         except Exception:
             log.logMsg(LogType.error, "[FilterContentInfoThread.freshHandler] %s %s" % (pending_url, traceback.format_exc()))