def getpagecomments(self, params):
    """Extract result links from a Tieba search page and queue the matches.

    params.customized['query'] holds the search keyword; params.content is
    the search-result HTML. Results whose title contains the keyword and
    whose publish date falls within the last ``self.querylastdays`` days
    are preprocessed and stored via ``self.__storeurllist__``.
    """
    info = params.customized['query']
    xparser = XPathUtility(html=params.content)
    hrefs = xparser.xpath('//*[@class="sosResult"]/strong/a/@href')
    titles = xparser.getlist('//*[@class="sosResult"]/strong/a')
    pubtimes = xparser.xpath('//*[@class="sosResult"]/span/cite[3]')

    today = datetime.datetime.strptime(
        TimeUtility.getcurrentdate(),
        TimeUtility.DATE_FORMAT_DEFAULT).date()
    # Hoisted loop invariant: maximum age (in days) a result may have.
    maxdays = int(self.querylastdays)

    urllist = []
    # hrefs / titles / pubtimes are parallel lists extracted from the
    # same result nodes — indexed together below.
    for index, title in enumerate(titles):
        # Skip results whose title does not contain the query keyword.
        if not Common.checktitle(info, title):
            continue
        # Normalize the publish time and keep only the date part.
        pubtimestr = TimeUtility.getuniformtime(
            pubtimes[index].text).split(' ')[0]
        pubtime = datetime.datetime.strptime(
            pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT).date()
        interval = today - pubtime
        # Keep only results published within the query window.
        if interval.days <= maxdays:
            newurl = self.preprocess(hrefs[index])
            if newurl is not None:
                urllist.append(newurl)
    if urllist:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
def step3tt(self, params):
    """Parse a JSON comment response and store every new comment.

    Returns False when no comment is newer than the stored newest time
    for ``params.originalurl``, True on success, None when the payload
    could not be parsed at all.
    """
    try:
        jsondata = json.loads(params.content)
        if jsondata:
            # Seed with "now" so min(publishlist) below is always
            # well-defined even when the entity list is empty.
            publishlist = [
                TimeUtility.getcurrentdate(TimeUtility.DEFAULTFORMAT)
            ]
            try:
                # The service signals a bad request with a bare string.
                if jsondata == "ERROR_PARAMETER":
                    return
                entitylist = jsondata['resultDO'].get('entityList', [])
                for comment in entitylist:
                    content = self.strfilter(comment['body'])
                    # Source format example: "Jul 3, 2017 4:46:30 PM".
                    # BUG FIX: %p only takes effect together with the
                    # 12-hour directive %I; the original %H silently
                    # ignored AM/PM and mapped afternoon times to the
                    # morning.
                    curtime = comment['replyTime']
                    curtime = time.strftime(
                        '%Y-%m-%d %H:%M:%S',
                        time.strptime(curtime, '%b %d, %Y %I:%M:%S %p'))
                    nick = comment['userName']
                    publishlist.append(curtime)
                    if not CMTStorage.exist(params.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content,
                                            curtime, nick)
            except Exception:
                Logger.printexception()
                Logger.getlogging().error(
                    'extract no comment from {site}'.format(
                        site=params.url))
            if not self.isnewesttime(params.originalurl, min(publishlist)):
                return False
            return True
    except Exception:
        Logger.printexception()
        Logger.getlogging().error(
            'extract comment error from {site}'.format(site=params.url))
def process(self, proparam): Logger.getlogging().info(proparam.url) try: if proparam.step is rayliComments.STEP_1: # 取得html中的commentType articleId = re.findall( '^http://bbs\.rayli\.com\.cn/gallery-(\d+)-\d+.html', proparam.url).__getitem__(0) #取得评论url comments_url = rayliComments.COMMENTS_URL % (articleId, 1) self.storeurl(comments_url, proparam.originalurl, rayliComments.STEP_2, { 'articleId': articleId, }) elif proparam.step == rayliComments.STEP_2: articleId = proparam.customized['articleId'] # 取得评论个数 comments_count = float( re.findall(ur'回复:</span> (\d+)</div>', proparam.content).__getitem__(0)) if int(comments_count) == 0: return # 判断增量 cmtnum = CMTStorage.getcount(proparam.originalurl, True) if cmtnum >= comments_count: return NewsStorage.setcmtnum(proparam.originalurl, comments_count) page_num = int( math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE)) if page_num >= self.maxpages: page_num = self.maxpages # 循环取得评论的url for page in range(1, page_num + 1, 1): # 取得评论的url url = rayliComments.COMMENTS_URL % (articleId, page) self.storeurl(url, proparam.originalurl, rayliComments.STEP_3) elif proparam.step == rayliComments.STEP_3: commentsInfo = [] soup = BeautifulSoup(proparam.content, 'html.parser') # 获取评论 comments = soup.select('.t_f') # 获取评论时间 commentTime = self.r.parse( ur'<em id="authorposton\d+">发表于 (.+?)</em>', proparam.content) # 获取nick nicks = soup.select('.xw1') # 是否首页 page = int( self.r.parse(ur'page=1-page-(\d+)', proparam.url)[0]) if page == 1: index = 1 else: index = 0 publishlist = [ TimeUtility.getcurrentdate(TimeUtility.DEFAULTFORMAT) ] if len(comments) > 0: # 获取评论 for index in range(index, len(comments), 1): content = comments[index].text.strip() curtime = commentTime[index] nick = nicks[index].text publishlist.append(curtime) if not CMTStorage.exist(proparam.originalurl, content, curtime, nick): CMTStorage.storecmt(proparam.originalurl, content, curtime, nick) # cmti = CommentInfo() # if URLStorage.storeupdatetime(proparam.originalurl, 
commentTime[index]): # cmti.content = comments[index].text # commentsInfo.append(cmti) if len(publishlist) > 0: publishdate = min(publishlist) NewsStorage.setpublishdate(proparam.originalurl, publishdate) # # 保存获取的评论 # if len(commentsInfo) > 0: # self.commentstorage.store(proparam.originalurl, commentsInfo) else: Logger.getlogging().error("proparam.step == %d", proparam.step) except Exception, e: traceback.print_exc()