示例#1
0
 def updatedb(self):
     """Purge expired entries from the spider URL/comment database.

     Loads every stored item, collects the keys whose recorded timestamp
     falls before the configured validity window, and removes them in one
     batch call.
     """
     dao = SpiderDao()  # one DAO instance for both the read and the remove
     items = dao.getall()
     if not items:
         return
     # Uniform date string marking the oldest still-valid entry.
     validdate = TimeUtility.getuniformdatebefore(SpiderConfigure.getinstance().getvalidperiod())
     removelist = []
     for key, value in items.items():
         info = URLCommentInfo.fromstring(value)
         if info.timestamp < validdate:
             Logger.getlogging().debug(value)
             removelist.append(key)
     # Skip the DB round-trip entirely when nothing has expired.
     if removelist:
         dao.remove(removelist)
示例#2
0
 def __init__(self):
     """Initialize comment-spider state: endpoint URL templates, paging
     sizes, workflow step markers and the comment time window."""
     SiteComments.__init__(self)
     # Page sizes for the generic comment API and the yunqi book site.
     self.page_size = 50
     self.page_size_yunqi = 10
     # URL templates for the various Tencent comment endpoints.
     self.COMMENTS_URL = 'http://coral.qq.com/article/{0}/comment?commentid={1}&reqnum={2}'
     self.AC_COMMENTS_URL = 'http://ac.qq.com/Community/topicList?targetId={0}&page={1}'
     self.EBOOK_COMMENTS_URL = 'http://ebook.qq.com/{site}/getComment.html?bid={bid}&pageIndex={page}'
     self.YUNQI_COMMENT_URL = 'http://yunqi.qq.com/bk/gdyq/%s-b.html?hot=0&p=%d'
     # Step markers driving the multi-request comment workflow.
     self.STEP_DEFAULT_VALUE = None
     self.STEP_COMMENT_FIRST_PAGE = 1
     self.STEP_COMMENT_NEXT_PAGE = 2
     self.hasnext = True
     # Only comments newer than this uniform date are collected.
     self.cmtlastdays = TimeUtility.getuniformdatebefore(
         int(SpiderConfigure.getinstance().getlastdays()))
     # Hard cap on comments gathered per article.
     self.comment_maxnum = 5000
 def step2(self, params):
     """Extract matching result links from a search-result page.

     Decodes the original query, parses the page content with html5lib,
     and for every result anchor whose title matches the query stores its
     URL; non-matching titles are logged with a warning code.
     """
     info = params.customized['query']
     query = Common.urldec(info)
     soup = BeautifulSoup(params.content, 'html5lib')
     items = soup.select('.cf > li > a.ui-list-ct')
     for item in items:
         try:
             url = item.get('href')
             title = item.get('title')
             if self.checktitle(query, title):
                 self.__storeurl__(url, TimeUtility.getuniformdatebefore(0),
                                   SPIDER_S2_WEBSITE_VIDEO)
             else:
                 Logger.log(params.originalurl,
                            constant.ERRORCODE_WARNNING_NOMATCHTITLE)
         # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
         # still propagate; any per-item failure is logged and skipped.
         except Exception:
             Logger.printexception()
示例#4
0
 def mkcachedir():
     """Recreate the spider's cache/temp directory tree and prune output
     directories older than the configured retention limit."""
     cache = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_TEMPLATE_WORK_DIRECTORY)
     # Start from a clean cache root.
     FileUtility.rmdir(cache)
     FileUtility.mkdirs(cache)
     # Create every per-stage temp directory beneath the storage location.
     temppaths = (const.SPIDER_QUERY_TEMP_PATH,
                  const.SPIDER_WAIBU_TEMP_PATH,
                  const.SPIDER_TIEBA_TEMP_PATH,
                  const.SPIDER_URLS_TEMP_PATH,
                  const.SPIDER_DONE_TEMP_PATH,
                  const.SPIDER_JSON_TEMP_PATH,
                  const.SPIDER_OUTPUT_TEMP_PATH)
     for temppath in temppaths:
         FileUtility.mkdirs(Storage.getstoragelocation(temppath))

     # Remove dated output subdirectories older than the limit; this
     # relies on entry names comparing like uniform date strings.
     limit = int(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH_LIMIT))
     outputpath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN, const.SPIDER_OUTPUT_PATH)
     if FileUtility.exists(outputpath):
         validdate = TimeUtility.getuniformdatebefore(limit)
         for entry in os.listdir(outputpath):
             if entry < validdate:
                 FileUtility.rmdir(os.path.join(outputpath, entry))
 def __init__(self):
     """Set up the regex helper, the comment time window and the page cap."""
     self.r = RegexUtility()
     # Concrete website is bound later by subclasses / callers.
     self.website = None
     # Earliest accepted comment time: midnight of (today - lastdays).
     lastdays = int(SpiderConfigure.getinstance().getlastdays())
     startdate = TimeUtility.getuniformdatebefore(delta=lastdays)[:10]
     self.cmtlastdays = startdate + u' 00:00:00'
     # Upper bound on comment pages fetched per site.
     self.maxpages = int(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                   const.SPIDER_S1_MAX_COMMENT_PAGES))