def step2bbs(self, params):
    """Process the first fetched page of a Laohu BBS thread.

    For a standard thread URL (matching the bbs.laohu.com pattern below)
    this reads the total page count from the pager ``<span>`` in
    ``params.content``, stores it in ``params.customized['lastpg']``,
    extracts the comments of the current page, and then queues every other
    page of the thread for ``LaohuComments.STEP_3_BBS``.  Any other URL is
    handed to ``CommenComments.getpagecomments2``.

    :param params: request context; this method reads ``originalurl``,
        ``url``, ``content`` and ``customized['field']`` and writes
        ``customized['lastpg']``.
    :return: None.
    """
    Logger.getlogging().info("LaohuComments.STEP_2_BBS")

    # Raw string: identical pattern text, but no invalid-escape warnings.
    if not self.r.parse(r'^http://bbs\.laohu\.com\/\w+-(\d+)-\d+-\d+\.html', params.originalurl):
        # Special URL: handled by the alternative comment extractor.
        CommenComments.getpagecomments2(self, params)
        return

    # S1
    field = params.customized['field']

    # Read the total number of pages from the pager span.  The captured
    # text is expected to look like "... / <N> ...": take what follows the
    # slash and pick the second space-separated token.
    # NOTE(review): assumes self.r.parse returns an indexable sequence of
    # matched strings -- confirm against the regex helper.
    try:
        pager_text = self.r.parse('<span title=".*">(.*?)</span>', params.content)[0]
        lastpg = int(pager_text.split('/')[1].split(' ')[1])
    except (IndexError, TypeError, ValueError):
        # The original `if lastpg is None` guard could never fire because
        # int() never returns None; a missing or malformed pager raised
        # instead.  Bail out quietly, as that guard intended.
        Logger.getlogging().debug('LaohuComments: total page count not found: ' + params.originalurl)
        return

    # Page number of the page currently being processed.
    pg = self.r.parse(self.BBS_URL_REG, params.url)[0]

    # Extract the comments of the current page.
    params.customized['lastpg'] = lastpg
    CommenComments.getpagecomments(self, params, self.BBS_URL_REG)

    # A single-page thread needs no further requests.
    if lastpg == 1:
        return

    # For S1 the remaining pages must be fetched to collect all comments.
    # The URL is expected to split into exactly four '-'-separated parts,
    # the third of which is the page number.
    url_parts = params.originalurl.split('-')
    if len(url_parts) != 4:
        return

    current_page = int(pg)  # loop-invariant, converted once
    for page in range(1, lastpg + 1):
        if page == current_page:
            continue
        comment_url = '-'.join([url_parts[0], url_parts[1], str(page), url_parts[3]])
        Logger.getlogging().debug(comment_url)
        self.storeurl(comment_url, params.originalurl, LaohuComments.STEP_3_BBS,
                      {'field': field, 'lastpg': lastpg})
def step3bbs(self, params):
    """Extract the comments of one follow-up BBS page.

    Logs the step name, then delegates to
    ``CommenComments.getpagecomments`` with this site's ``BBS_URL_REG``.

    :param params: request context passed through unchanged to the
        shared comment extractor.
    :return: None.
    """
    log = Logger.getlogging()
    log.info("LaohuComments.STEP_3_BBS")

    extract_comments = CommenComments.getpagecomments
    extract_comments(self, params, self.BBS_URL_REG)