def removecachefile():
    # Move the current cache directory into a timestamped backup folder,
    # then prune backup folders older than the configured retention limit.
    cache = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                      const.SPIDER_TEMPLATE_WORK_DIRECTORY)
    databackupfolder = SpiderConfigure.getconfig(
        const.SPIDER_STORAGE_DOMAIN, const.SPIDER_DATA_BACKUP_PATH
    ) + TimeUtility.getcurrentdate(TimeUtility.TIMESTAMP_FORMAT)
    if FileUtility.exists(cache):
        FileUtility.move(cache, databackupfolder)
        FileUtility.rmdir(cache)
    limit = int(SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                          const.SPIDER_OUTPUT_PATH_LIMIT))
    databackuppath = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                               const.SPIDER_DATA_BACKUP_PATH)
    if FileUtility.exists(databackuppath):
        validdate = TimeUtility.getdatebefore(limit, '%Y%m%d000000')
        for s in os.listdir(databackuppath):
            # Backup folders are named with a timestamp, so a plain string
            # comparison is enough to spot folders older than the cutoff.
            if s < validdate:
                fullpath = os.path.join(databackuppath, s)
                Logger.getlogging().info('remove cache folder ' + fullpath)
                FileUtility.rmdir(fullpath)
def getpagecomments_step2(self, params):
    try:
        page = params.customized['page']
        soup = BeautifulSoup(params.content, "html5lib")
        d_post_content_main = soup.select('#j_p_postlist > div.j_l_post')
        if page == 1:
            # The first post on page 1 is the thread body: extract its
            # publish time, then drop it from the reply list.
            main_item = d_post_content_main[0]
            pubtimes = ''
            pubtimesobj = main_item.select('.tail-info')
            if pubtimesobj:
                pubtimes = getuniformtime(pubtimesobj[-1].get_text().strip())
            else:
                pubtimeslist = re.findall(r'\d+-\d+-\d+ \d+:\d+', str(main_item))
                if pubtimeslist:
                    pubtimes = getuniformtime(pubtimeslist[0])
            if pubtimes:
                NewsStorage.setpublishdate(params.originalurl, pubtimes)
                if not compareNow(pubtimes, self.COMMENT_LIMIT_DAYS):
                    Logger.log(params.originalurl, constant.ERRORCODE_WARNNING_NOMATCHTIME)
                    # Threads older than 7 days (COMMENT_LIMIT_DAYS): stop
                    # fetching their replies/comments.
                    return False
            d_post_content_main = d_post_content_main[1:]
        for item in d_post_content_main:
            try:
                comment = item.find(attrs={'id': re.compile("post_content")})
                if not comment:
                    continue
                content = comment.get_text().strip()
                pubtimes = ''
                pubtimesobj = item.select('.tail-info')
                if pubtimesobj:
                    pubtimes = getuniformtime(pubtimesobj[-1].get_text().strip())
                else:
                    pubtimeslist = re.findall(r'\d+-\d+-\d+ \d+:\d+', str(item))
                    if pubtimeslist:
                        pubtimes = getuniformtime(pubtimeslist[0])
                if not pubtimes:
                    # No publish time found: fall back to today's date.
                    if not CMTStorage.exist(params.originalurl, content,
                                            TimeUtility.getdatebefore(0), 'nick'):
                        CMTStorage.storecmt(params.originalurl, content,
                                            TimeUtility.getdatebefore(0), 'nick')
                    continue
                # Check whether the comment was posted yesterday; only
                # store it if so, skipping duplicates.
                Logger.getlogging().debug(pubtimes)
                if self.isyestoday(pubtimes):
                    if not CMTStorage.exist(params.originalurl, content, pubtimes, 'nick'):
                        CMTStorage.storecmt(params.originalurl, content, pubtimes, 'nick')
            except:
                Logger.printexception()
        return True
    except:
        Logger.printexception()
        return False
def clear(self):
    # Remove backup directories older than the configured retention
    # window (self.pucsavedays); the cutoff is computed once, outside
    # the loop, since it does not change per entry.
    validdate = TimeUtility.getdatebefore(self.pucsavedays,
                                          TimeUtility.DATE_FORMAT_DEFAULT)
    for tm in os.listdir(self.pucbackpath):
        if tm < validdate:
            FileUtility.rmdir(os.path.join(self.pucbackpath, tm))