def process(self, params):
    """Dispatch an Ifeng search-query step.

    First page: read the total hit count, compute the page count and
    queue every result-page URL. Each subsequent page: delegate to step2.
    """
    if params.step == IfengS2Query.IFENG_S2QUERY_FIRST_PAGE:
        q = params.customized['query']
        xparser = XPathUtility(params.content)
        # Total hit count shown in the search-result header.
        count = xparser.getnumber('//div[@class="serpinfo"]/span/em')
        # BUG FIX: the original converted the count to a string and then
        # compared it with the integer 0, so the "no results" branch could
        # never trigger (in Python 2, str > int is always True).
        if count == 0:
            return
        pagenum = int(math.ceil(float(count) / IfengS2Query.DEFAULT_PAGE_SIZE))
        # Clamp to the configured crawl limit.
        if pagenum >= self.maxpages:
            pagenum = self.maxpages
        querylist = []
        for page in range(1, pagenum + 1):
            querylist.append(IfengS2Query.IFENG_QUERY_TEMPLATE.format(pn=page, q=q))
        self.__storeqeuryurllist__(
            querylist, IfengS2Query.IFENG_S2QUERY_EACH_PAGE, {'info': q})
    elif params.step == IfengS2Query.IFENG_S2QUERY_EACH_PAGE:
        self.step2(params)
def bbs_step3(self, params): try: xparser = XPathUtility(params.content) page = params.customized['page'] pagecount = params.customized['pagecount'] comments = [] updatetimes = [] nicks = [] contents = xparser.getcomments('//*[@class="read"]') mid_times = xparser.getlist('//td[@class="authorname"]') for times in mid_times: updatetimes.append(self.r.parse(ur'于(\d+-\d+-\d+ \d+:\d+:\d+)留言', times)[0]) nicks.append(self.r.parse(ur'(.*)于', times)[0]) if page == 0: mid_index = 1 elif page > 0: mid_index = 0 comments_number = xparser.getnumber('//*[@id="msgsubject"]/font') if comments_number != 0: for index in range(mid_index, len(contents), 1): curtime = TimeUtility.getuniformtime(updatetimes[index]) content = contents[index] nick = nicks[index].split('于')[0].split('☆')[-1] if not CMTStorage.exist(params.originalurl, content, curtime, nick): CMTStorage.storecmt(params.originalurl, content, curtime, nick) except Exception, e: traceback.print_exc()
def bbs_step2(self, params):
    """Compare the thread's comment count with what is stored and, when
    new comments exist, queue every comment page for the next step."""
    try:
        xparser = XPathUtility(params.content)
        comment_counts = int(xparser.getnumber('//*[@id="msgsubject"]/font'))
        if comment_counts == 0:
            return
        # Incremental crawl: skip when we already hold at least this many.
        cmtnum = CMTStorage.getcount(params.originalurl, True)
        if cmtnum >= comment_counts:
            return
        pagecount = xparser.getnumber('//*[@id="pager_top"]')
        # Pages are 0-based and the last valid index equals pagecount.
        for page in range(0, pagecount + 1):
            commentUrl = JjwxcBbsComments.COMMENTS_URL.format(
                url=params.originalurl, pageno=page)
            Logger.getlogging().debug(commentUrl)
            self.storeurl(commentUrl, params.originalurl,
                          JjwxcBbsComments.BBS_NEXT_PAGE,
                          {'page': page, 'pagecount': pagecount})
        NewsStorage.setcmtnum(params.originalurl, comment_counts)
    except Exception:
        # Unused bound variable removed from the original "except ..., e".
        traceback.print_exc()
def step2bbs(self, params):
    """Record the page's comment total and queue the "load all" comment
    URL for step 3 when the count has grown since the last crawl."""
    Logger.getlogging().info("Ea3wcomments.STEP_2")
    commentinfo_url = params.customized['commentinfo_url'] + "&load=all"
    comments_count = XPathUtility(params.content).getnumber(
        '//div[@class="at-comment"]/a/span')
    # Incremental check: nothing to do unless the count increased.
    if URLStorage.getcmtnum(params.originalurl) >= comments_count:
        return
    URLStorage.setcmtnum(params.originalurl, comments_count)
    self.storeurl(commentinfo_url, params.originalurl, Ea3wcomments.STEP_3_BBS)
def process(self, params): Logger.getlogging().info(params.url) try: if params.step is Dm123BbsComments.STEP_1: xparser = XPathUtility(params.content) #通过第一次传进来的URL判断是否有后续页面 keyvalue = self.r.parse('tid-(.*?).html', params.url)[0] pagecount = xparser.getnumber( '//*[@class="pages"]/div[@class="fl"]') commentinfo_url = params.url self.storeurl(commentinfo_url, params.originalurl, Dm123BbsComments.STEP_2, { 'keyvalue': keyvalue, 'totalpage': pagecount, 'curpage': 1 }) elif params.step == Dm123BbsComments.STEP_2: keyvalue = params.customized['keyvalue'] curpage = params.customized['curpage'] xparser = XPathUtility(params.content) commentsinfo = xparser.getcomments( '//div[contains(@class,"tpc_content")]') commentstime = self.r.parse(ur'\"(\d+-\d+-\d+ \d+:\d+)\">发表于:', params.content) comments = [] for index in range(0, len(commentstime)): cmti = CommentInfo() if URLStorage.storeupdatetime( params.originalurl, TimeUtility.getuniformtime(commentstime[0] + ':00')): # 获取增加的评论(根据时间比较) cmti.content = commentsinfo[index] comments.append(cmti) if len(comments) > 0: self.commentstorage.store(params.originalurl, comments) nextpageList = [keyvalue, "-page-", str(curpage + 1)] nextpage = '' nextpage = nextpage.join(nextpageList) if int(nextpageList[2]) <= int(params.customized['totalpage']): comment_url = Dm123BbsComments.COMMENT_URL.format( page=nextpage) self.storeurl( comment_url, params.originalurl, Dm123BbsComments.STEP_2, { 'keyvalue': nextpageList[0], 'totalpage': params.customized['totalpage'], 'curpage': curpage + 1 }) except Exception, e: traceback.print_exc()
def step1(self, params):
    """Determine the comment-page range for a hupu BBS thread and queue
    the crawl, starting from the newest page."""
    # Thread id from http(s)://bbs.hupu.com/<id>...
    docurl = self.r.parse('^http[s]{0,1}://bbs\.hupu\.com\/(\d+)',
                          params.originalurl)
    if docurl:
        docurl = docurl[0]
    else:
        Logger.getlogging().debug('{url}:20000'.format(url=params.originalurl))
        return
    xparser = XPathUtility(params.content)
    # Pager anchors: the second-to-last anchor holds the last page number.
    # (The original's trailing "else: return" was unreachable and is
    # dropped; a single-anchor pager no longer raises IndexError.)
    pageList = xparser.getcomments('//div[@class="page"]/a')
    if len(pageList) >= 2:
        pagenum = pageList[-2]
    else:
        pagenum = 1
    if int(pagenum) >= self.maxpages:
        pagenum = self.maxpages
    # Current comment total vs. what we already stored (incremental).
    curcmtnum = xparser.getnumber('//span[@class="browse"]')
    NewsStorage.setcmtnum(params.originalurl, curcmtnum)
    dbcmtnum = CMTStorage.getcount(params.originalurl, True)
    if dbcmtnum >= curcmtnum:
        return
    # First page that may contain unseen comments; clamp the window.
    start = int(dbcmtnum / self.page_size) + 1
    end = int(pagenum)
    if end > start + self.maxpages:
        start = end - self.maxpages
    params.customized['page'] = 1
    if end == 1:
        self.step2(params)
        return
    if start == 1:
        self.step2(params)
    comment_url = self.COMMENT_URL.format(docurl=docurl, page=end)
    self.storeurl(comment_url, params.originalurl, hupuComments.STEP_1_2,
                  {'docurl': docurl, 'page': end, 'start': start, 'end': end})
def step1(self, params):
    """Compute the number of search-result pages from the total hit count
    and queue every result-page URL."""
    info = params.customized['query']
    xhtml = XPathUtility(html=params.content)
    video_counts = xhtml.getnumber('//*[@class="act"]/span/em')
    Logger.getlogging().debug(video_counts)
    # Nothing found: stop here.
    if video_counts == 0:
        return
    # BUG FIX: force float division; under Python 2, int/int truncates
    # BEFORE math.ceil runs, silently dropping the final partial page.
    # (Sibling query classes in this file already use float(...).)
    page_count = int(math.ceil(float(video_counts) / DuowanS2Query.DEFAULT_PAGE_SIZE))
    if page_count >= self.maxpages:
        page_count = self.maxpages
    # Build every result-page URL from the template.
    querylist = []
    for page in range(1, page_count + 1):
        querylist.append(DuowanS2Query.QUERY_TEMPLATE.format(key=info, pageno=page))
    self.__storeqeuryurllist__(querylist, DuowanS2Query.S2QUERY_EACH_PAGE,
                               {'query': info})
def step2(self, params): Logger.getlogging().info("Flash8Comments.STEP_2") # 将STEP_1中的docurl传下来 docurl = params.customized['docurl'] xparser = XPathUtility(params.content) commentsinfo = xparser.getstring('//div[@class="page"]/span/font[1]') # 保存页面评论量 cmtnum = URLStorage.getcmtnum(params.originalurl) if cmtnum >= int(commentsinfo[0]): return URLStorage.setcmtnum(params.originalurl, int(commentsinfo[0])) # 总数除以page_size,然后加1,可得到评论总页数comments_count pagecount = xparser.getnumber('//*[@class="pg"]/label/span') if pagecount == 0: pagecount = pagecount + 1 for page in range(1, pagecount + 1, 1): comment_url = Flash8Comments.COMMENT_URL.format(docurl=docurl, page=page) self.storeurl(comment_url, params.originalurl, Flash8Comments.STEP_3, {'page': page})
def step2(self, params):
    """Choose between single-page and paged comment extraction.

    getnumber yields 0 both when there are no comments and when all
    comments fit on the current page, so a 0 requires a second look at
    the comment nodes themselves before deciding.
    """
    Logger.getlogging().info("Dm123NewsComments.STEP_2")
    classid = params.customized['classid']
    news_id = params.customized['id']  # renamed locally; 'id' shadows a builtin
    xparser = XPathUtility(params.content)
    comments_count = xparser.getnumber('//div/a[1]/b')
    single_page = (comments_count == 0)
    commentsinfos = []
    commentstimes = []
    if single_page:
        commentsinfos = xparser.getcomments('//div[@class="rbvalueout"]')
        commentstimes = xparser.getcomments('//span[@class="rbtime"]')
        comments_count = len(commentsinfos)
        # Genuinely no comments.
        if comments_count == 0:
            return
    # Incremental check (was duplicated verbatim in both original branches).
    cmtnum = URLStorage.getcmtnum(params.originalurl)
    if cmtnum >= comments_count:
        return
    URLStorage.setcmtnum(params.originalurl, comments_count)
    if single_page:
        # Everything is already on this page; hand the parsed nodes along.
        self.storeurl(params.originalurl, params.originalurl,
                      Dm123NewsComments.STEP_3,
                      {'is_only_one_page': True,
                       'commentsinfos': commentsinfos,
                       'commentstimes': commentstimes})
    else:
        # Queue one URL per comment page (0-based).
        page_count = int(math.ceil(float(comments_count) / self.page_size))
        for page in range(0, page_count):
            comment_url = Dm123NewsComments.COMMENT_URL.format(
                page=page, classid=classid, id=news_id)
            self.storeurl(comment_url, params.originalurl,
                          Dm123NewsComments.STEP_3,
                          {'is_only_one_page': False})
def process(self, params): Logger.getlogging().info(params.url) try: if params.step is SeventeenKComments.STEP_1: #Step1: 通过得到docurl,得到获取评论的首页url。 #Logger.getlogging().info("proparam.step is None") # 在视频url中取出docurl,^http://v\.ifeng\.com\/\w+\/\w+/\d{6}\/[0-9a-z-]+\.shtml # 取URL中的([0-9a-z-]+)参数,此参数为docurl docurl = self.r.parse( '^http://bbs\.17k\.com\/thread-(\d+)-\d+-1\.html', params.originalurl)[0] #Logger.getlogging().debug(docurl) # 评论首页URL为http://comment.ifeng.com/getv.php?job=1&docurl=([0-9a-z-]+)&p=1 commentinfo_url = 'http://bbs.17k.com/thread-{docurl}-1-1.html'.format( docurl=docurl) self.storeurl(commentinfo_url, params.originalurl, SeventeenKComments.STEP_2, {'docurl': docurl}) elif params.step == SeventeenKComments.STEP_2: #将STEP_1中的docurl传下来 docurl = params.customized['docurl'] # Step2: 通过Step1设置url,得到评论的总数,并根据评论总数得到获取其他评论的url。 #Logger.getlogging().info("params.step == 2") # 打开STEP1中URL,截取"count":num字段,取出num的值,num字段为评论总数 xparser = XPathUtility(params.content) commentsinfo = xparser.getnumber( '//*[@class="hm ptn"]/span[5]') #Logger.getlogging().debug(comments_count / self.page_size) #Logger.getlogging().debug(math.ceil(comments_count / self.page_size)) # 保存页面评论量 cmtnum = URLStorage.getcmtnum(params.originalurl) if cmtnum >= int(commentsinfo): return URLStorage.setcmtnum(params.originalurl, int(commentsinfo)) # 总数除以page_size,然后加1,可得到评论总页数comments_count # 循环http://comment.ifeng.com/getv.php?job=1&docurl=([0-9a-z-]+)&p=comments_count,从一开始循环到上一步操作取到的数值,从而得到所有评论的URL,并保存 pagecount = xparser.getnumber('//*[@class="pg"]/label/span') for page in range(1, pagecount + 1, 1): comment_url = SeventeenKComments.COMMENT_URL.format( docurl=docurl, page=page) self.storeurl(comment_url, params.originalurl, SeventeenKComments.STEP_3, {'page': page}) elif params.step == SeventeenKComments.STEP_3: # Step3: 通过Step2设置的url,得到所有评论,抽取评论 #Logger.getlogging().info("params.step == 3") page = params.customized['page'] xparser = XPathUtility(params.content) commentsinfo = 
xparser.getcomments( '//*[contains(@id,"postmessage")]') commentstime = self.r.parse(ur'发表于 (\d+-\d+-\d+ \d+:\d+)</em>', params.content) comments = [] #获取评论 # 设置实际的评论量 if page is 1: statrIndex = 1 else: statrIndex = 0 for index in range(statrIndex, len(commentstime), 1): cmti = CommentInfo() if URLStorage.storeupdatetime(params.originalurl, commentstime[index] + ':00'): # 获取增加的评论(根据时间比较) cmti.content = commentsinfo[index] comments.append(cmti) # 保存获取到的评论 if len(comments) > 0: self.commentstorage.store(params.originalurl, comments) else: Logger.getlogging().error( 'proparam.step == {step}'.format(step=params.step)) except Exception, e: traceback.print_exc()
def process(self, params):
    """Two-step ZOL BBS comment crawler.

    Step 1: parse counts/page numbers from the book page, extract the
    current page's comments and queue the remaining pages newest-first.
    Step 2: extract comments from one queued page.
    """
    Logger.getlogging().info(params.url)
    try:
        # BUG FIX: 'is' compared step constants by identity; use '=='.
        if params.step == ZolbbsComments.STEP_1:
            if not self.r.search(self.FORMAT, params.originalurl):
                Logger.getlogging().error(params.originalurl)
                return
            value = self.r.parse(self.FORMAT, params.originalurl)[0]
            field = value[0]
            boardid = value[1]
            bookid = value[2]
            # Current page from the optional 4th URL group; default 1.
            curpage = 1
            if len(value) > 3 and value[3] != '':
                curpage = int(value[3][1:])
            # Total page count embedded in the page source.
            totalpagestr = self.r.getid('totalPage', params.content)
            if totalpagestr == '':
                Logger.getlogging().error('Unable to get totalPage')
                return
            # (Removed unused local: the original also built an etree.HTML
            # tree it never read.)
            xparser = XPathUtility(params.content)
            comments_count = int(
                xparser.getnumber('//*[@id="bookTitle"]/div/em[2]'))
            if comments_count == 0:
                return
            # Incremental check against comments already stored.
            cmtnum = CMTStorage.getcount(params.originalurl, True)
            if cmtnum >= comments_count:
                return
            NewsStorage.setcmtnum(params.originalurl, comments_count)
            # Extract the current page's comments before queueing the rest.
            params.customized['page'] = curpage
            self.geturlcomments(params)
            totalPage = int(totalpagestr)
            if totalPage >= self.maxpages:
                totalPage = self.maxpages
            # Window of pages that may hold unseen comments, clamped.
            start = int(cmtnum / self.PAGE_SIZE) + 1
            end = int(totalPage)
            if end > start + self.maxpages:
                start = end - self.maxpages
            # Newest pages first; the current page was handled above.
            for page in range(end, start - 1, -1):
                if page == curpage:
                    continue
                comment_url = ZolbbsComments.COMMENT_URL.format(
                    field=field, boardid=boardid, bookid=bookid, page=page)
                self.storeurl(comment_url, params.originalurl,
                              ZolbbsComments.STEP_2, {'page': page})
        elif params.step == ZolbbsComments.STEP_2:
            self.geturlcomments(params)
        else:
            Logger.getlogging().error(
                'proparam.step == {step}'.format(step=params.step))
    except Exception:
        traceback.print_exc()