예제 #1
0
 def process(self, params):
     if params.step == IfengS2Query.IFENG_S2QUERY_FIRST_PAGE:
         q = params.customized['query']
         # html = etree.HTML(params.content)
         xparser = XPathUtility(params.content)
         mid_count = xparser.getnumber('//div[@class="serpinfo"]/span/em')
         count = str(mid_count).strip()
         querylist = []
         # 获取不到,则返回
         if count == 0:
             return
         elif count > 0:
             pagenum = int(
                 math.ceil(float(count) / IfengS2Query.DEFAULT_PAGE_SIZE))
             if pagenum >= self.maxpages:
                 pagenum = self.maxpages
             for page in range(1, pagenum + 1, 1):
                 url = IfengS2Query.IFENG_QUERY_TEMPLATE.format(pn=page,
                                                                q=q)
                 querylist.append(url)
             self.__storeqeuryurllist__(
                 querylist, IfengS2Query.IFENG_S2QUERY_EACH_PAGE,
                 {'info': q})
     elif params.step == IfengS2Query.IFENG_S2QUERY_EACH_PAGE:
         self.step2(params)
 def bbs_step3(self, params):
     try:
         xparser = XPathUtility(params.content)
         page = params.customized['page']
         pagecount = params.customized['pagecount']
         comments = []
         updatetimes = []
         nicks = []
         contents = xparser.getcomments('//*[@class="read"]')
         mid_times = xparser.getlist('//td[@class="authorname"]')
         for times in mid_times:
             updatetimes.append(self.r.parse(ur'于(\d+-\d+-\d+ \d+:\d+:\d+)留言', times)[0])
             nicks.append(self.r.parse(ur'(.*)于', times)[0])
         if page == 0:
             mid_index = 1
         elif page > 0:
             mid_index = 0
         comments_number = xparser.getnumber('//*[@id="msgsubject"]/font')
         if comments_number != 0:
             for index in range(mid_index, len(contents), 1):
                 curtime = TimeUtility.getuniformtime(updatetimes[index])
                 content = contents[index]
                 nick = nicks[index].split('于')[0].split('☆')[-1]
                 if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                     CMTStorage.storecmt(params.originalurl, content, curtime, nick)
     except Exception, e:
         traceback.print_exc()
    def bbs_step2(self, params):
        try:
            xparser = XPathUtility(params.content)
            comment_counts = int(xparser.getnumber('//*[@id="msgsubject"]/font'))
            if comment_counts == 0:
                return
            cmtnum = CMTStorage.getcount(params.originalurl, True)
            # 判断增量
            if cmtnum >= comment_counts:
                return
            pagecount = xparser.getnumber('//*[@id="pager_top"]')

            for page in range(0, pagecount + 1, 1):
                commentUrl = JjwxcBbsComments.COMMENTS_URL.format(url=params.originalurl, pageno=page)
                Logger.getlogging().debug(commentUrl)
                self.storeurl(commentUrl, params.originalurl, JjwxcBbsComments.BBS_NEXT_PAGE, {'page': page,'pagecount':pagecount})
                NewsStorage.setcmtnum(params.originalurl, comment_counts)
        except Exception, e:
            traceback.print_exc()
 def step2bbs(self, params):
     Logger.getlogging().info("Ea3wcomments.STEP_2")
     commentinfo_url = params.customized['commentinfo_url'] + "&load=all"
     xparser = XPathUtility(params.content)
     comments_count = xparser.getnumber('//div[@class="at-comment"]/a/span')
     # 保存页面评论量
     cmtnum = URLStorage.getcmtnum(params.originalurl)
     if cmtnum >= comments_count:
         return
     URLStorage.setcmtnum(params.originalurl, comments_count)
     self.storeurl(commentinfo_url, params.originalurl,
                   Ea3wcomments.STEP_3_BBS)
예제 #5
0
    def process(self, params):
        Logger.getlogging().info(params.url)
        try:
            if params.step is Dm123BbsComments.STEP_1:
                xparser = XPathUtility(params.content)
                #通过第一次传进来的URL判断是否有后续页面
                keyvalue = self.r.parse('tid-(.*?).html', params.url)[0]
                pagecount = xparser.getnumber(
                    '//*[@class="pages"]/div[@class="fl"]')
                commentinfo_url = params.url
                self.storeurl(commentinfo_url, params.originalurl,
                              Dm123BbsComments.STEP_2, {
                                  'keyvalue': keyvalue,
                                  'totalpage': pagecount,
                                  'curpage': 1
                              })

            elif params.step == Dm123BbsComments.STEP_2:
                keyvalue = params.customized['keyvalue']
                curpage = params.customized['curpage']
                xparser = XPathUtility(params.content)
                commentsinfo = xparser.getcomments(
                    '//div[contains(@class,"tpc_content")]')
                commentstime = self.r.parse(ur'\"(\d+-\d+-\d+ \d+:\d+)\">发表于:',
                                            params.content)
                comments = []
                for index in range(0, len(commentstime)):
                    cmti = CommentInfo()
                    if URLStorage.storeupdatetime(
                            params.originalurl,
                            TimeUtility.getuniformtime(commentstime[0] +
                                                       ':00')):
                        # 获取增加的评论(根据时间比较)
                        cmti.content = commentsinfo[index]
                        comments.append(cmti)
                if len(comments) > 0:
                    self.commentstorage.store(params.originalurl, comments)
                nextpageList = [keyvalue, "-page-", str(curpage + 1)]
                nextpage = ''
                nextpage = nextpage.join(nextpageList)
                if int(nextpageList[2]) <= int(params.customized['totalpage']):
                    comment_url = Dm123BbsComments.COMMENT_URL.format(
                        page=nextpage)
                    self.storeurl(
                        comment_url, params.originalurl,
                        Dm123BbsComments.STEP_2, {
                            'keyvalue': nextpageList[0],
                            'totalpage': params.customized['totalpage'],
                            'curpage': curpage + 1
                        })

        except Exception, e:
            traceback.print_exc()
    def step1(self, params):
        """"""
        #Step1: 通过得到docurl,得到获取评论的url的参数。
        #docurl = self.r.parse('^http://bbs\.hupu\.com\/(\d+)', params.originalurl)
        docurl = self.r.parse('^http[s]{0,1}://bbs\.hupu\.com\/(\d+)',
                              params.originalurl)
        if docurl:
            docurl = docurl[0]
        else:
            Logger.getlogging().debug(
                '{url}:20000'.format(url=params.originalurl))
            return
        # 取得正文
        xparser = XPathUtility(params.content)
        #取得页数
        pageList = xparser.getcomments('//div[@class="page"]/a')
        if not pageList:
            pagenum = 1
        elif pageList:
            pagenum = pageList[-2]
        else:
            return
        if int(pagenum) >= self.maxpages:
            pagenum = self.maxpages
        # 评论总数
        curcmtnum = xparser.getnumber('//span[@class="browse"]')
        NewsStorage.setcmtnum(params.originalurl, curcmtnum)
        dbcmtnum = CMTStorage.getcount(params.originalurl, True)
        if dbcmtnum >= curcmtnum:
            return

        start = int(dbcmtnum / self.page_size) + 1
        end = int(pagenum)
        if end > start + self.maxpages:
            start = end - self.maxpages

        params.customized['page'] = 1
        if end == 1:
            self.step2(params)
            return
        if start == 1:
            self.step2(params)
        comment_url = self.COMMENT_URL.format(docurl=docurl, page=end)
        self.storeurl(comment_url, params.originalurl, hupuComments.STEP_1_2, {
            'docurl': docurl,
            'page': end,
            'start': start,
            'end': end
        })
 def step1(self, params):
     """"""
     #Step2: 根据返回内容,通过xpath 得到搜索结果总数
     info = params.customized['query']
     xhtml = XPathUtility(html=params.content)
     video_counts = xhtml.getnumber('//*[@class="act"]/span/em')
     Logger.getlogging().debug(video_counts)
     # 获取不到,则返回
     if video_counts == 0:
         return
     page_count = int(
         math.ceil(video_counts / DuowanS2Query.DEFAULT_PAGE_SIZE))
     if page_count >= self.maxpages:
         page_count = self.maxpages
     # 根据上面的page_count数,拼出所有的搜索结果url
     querylist = []
     for page in range(1, page_count + 1, 1):
         url = DuowanS2Query.QUERY_TEMPLATE.format(key=info, pageno=page)
         querylist.append(url)
     self.__storeqeuryurllist__(querylist, DuowanS2Query.S2QUERY_EACH_PAGE,
                                {'query': info})
예제 #8
0
    def step2(self, params):
        Logger.getlogging().info("Flash8Comments.STEP_2")
        # 将STEP_1中的docurl传下来
        docurl = params.customized['docurl']
        xparser = XPathUtility(params.content)
        commentsinfo = xparser.getstring('//div[@class="page"]/span/font[1]')

        # 保存页面评论量
        cmtnum = URLStorage.getcmtnum(params.originalurl)
        if cmtnum >= int(commentsinfo[0]):
            return
        URLStorage.setcmtnum(params.originalurl, int(commentsinfo[0]))

        # 总数除以page_size,然后加1,可得到评论总页数comments_count
        pagecount = xparser.getnumber('//*[@class="pg"]/label/span')
        if pagecount == 0:
            pagecount = pagecount + 1

        for page in range(1, pagecount + 1, 1):
            comment_url = Flash8Comments.COMMENT_URL.format(docurl=docurl,
                                                            page=page)
            self.storeurl(comment_url, params.originalurl,
                          Flash8Comments.STEP_3, {'page': page})
    def step2(self, params):
        Logger.getlogging().info("Dm123NewsComments.STEP_2")
        classid = params.customized['classid']
        id = params.customized['id']
        xparser = XPathUtility(params.content)
        # 评论总数(当评论不满一页时,直接获取到的comments_count为0)
        comments_count = xparser.getnumber('//div/a[1]/b')

        # comments_count为0时分两种情况,真的没有评论和有评论
        if 0 == comments_count:
            commentsinfos = xparser.getcomments('//div[@class="rbvalueout"]')
            commentstimes = xparser.getcomments('//span[@class="rbtime"]')
            # comments_count重新赋值
            comments_count = len(commentsinfos)
            if 0 == comments_count:
                return
            else:
                # 判断增量
                cmtnum = URLStorage.getcmtnum(params.originalurl)
                if cmtnum >= comments_count:
                    return
                URLStorage.setcmtnum(params.originalurl, comments_count)
                self.storeurl(params.originalurl, params.originalurl, Dm123NewsComments.STEP_3,
                              {'is_only_one_page': True, 'commentsinfos': commentsinfos,
                               'commentstimes': commentstimes})
        else:
            # 判断增量
            cmtnum = URLStorage.getcmtnum(params.originalurl)
            if cmtnum >= comments_count:
                return
            URLStorage.setcmtnum(params.originalurl, comments_count)
            # 评论页数
            page_count = int(math.ceil(float(comments_count) / self.page_size))
            for page in range(0, int(page_count), 1):
                comment_url = Dm123NewsComments.COMMENT_URL.format(page=page, classid=classid, id=id)
                self.storeurl(comment_url, params.originalurl, Dm123NewsComments.STEP_3,
                              {'is_only_one_page': False})
    def process(self, params):
        Logger.getlogging().info(params.url)
        try:
            if params.step is SeventeenKComments.STEP_1:
                #Step1: 通过得到docurl,得到获取评论的首页url。
                #Logger.getlogging().info("proparam.step is None")
                # 在视频url中取出docurl,^http://v\.ifeng\.com\/\w+\/\w+/\d{6}\/[0-9a-z-]+\.shtml
                # 取URL中的([0-9a-z-]+)参数,此参数为docurl
                docurl = self.r.parse(
                    '^http://bbs\.17k\.com\/thread-(\d+)-\d+-1\.html',
                    params.originalurl)[0]
                #Logger.getlogging().debug(docurl)
                # 评论首页URL为http://comment.ifeng.com/getv.php?job=1&docurl=([0-9a-z-]+)&p=1
                commentinfo_url = 'http://bbs.17k.com/thread-{docurl}-1-1.html'.format(
                    docurl=docurl)
                self.storeurl(commentinfo_url, params.originalurl,
                              SeventeenKComments.STEP_2, {'docurl': docurl})

            elif params.step == SeventeenKComments.STEP_2:
                #将STEP_1中的docurl传下来
                docurl = params.customized['docurl']
                # Step2: 通过Step1设置url,得到评论的总数,并根据评论总数得到获取其他评论的url。
                #Logger.getlogging().info("params.step == 2")
                # 打开STEP1中URL,截取"count":num字段,取出num的值,num字段为评论总数
                xparser = XPathUtility(params.content)
                commentsinfo = xparser.getnumber(
                    '//*[@class="hm ptn"]/span[5]')

                #Logger.getlogging().debug(comments_count / self.page_size)
                #Logger.getlogging().debug(math.ceil(comments_count / self.page_size))

                # 保存页面评论量
                cmtnum = URLStorage.getcmtnum(params.originalurl)
                if cmtnum >= int(commentsinfo):
                    return
                URLStorage.setcmtnum(params.originalurl, int(commentsinfo))

                # 总数除以page_size,然后加1,可得到评论总页数comments_count
                # 循环http://comment.ifeng.com/getv.php?job=1&docurl=([0-9a-z-]+)&p=comments_count,从一开始循环到上一步操作取到的数值,从而得到所有评论的URL,并保存
                pagecount = xparser.getnumber('//*[@class="pg"]/label/span')

                for page in range(1, pagecount + 1, 1):
                    comment_url = SeventeenKComments.COMMENT_URL.format(
                        docurl=docurl, page=page)
                    self.storeurl(comment_url, params.originalurl,
                                  SeventeenKComments.STEP_3, {'page': page})

            elif params.step == SeventeenKComments.STEP_3:
                # Step3: 通过Step2设置的url,得到所有评论,抽取评论
                #Logger.getlogging().info("params.step == 3")
                page = params.customized['page']
                xparser = XPathUtility(params.content)
                commentsinfo = xparser.getcomments(
                    '//*[contains(@id,"postmessage")]')
                commentstime = self.r.parse(ur'发表于 (\d+-\d+-\d+ \d+:\d+)</em>',
                                            params.content)
                comments = []

                #获取评论
                # 设置实际的评论量
                if page is 1:
                    statrIndex = 1
                else:
                    statrIndex = 0
                for index in range(statrIndex, len(commentstime), 1):
                    cmti = CommentInfo()
                    if URLStorage.storeupdatetime(params.originalurl,
                                                  commentstime[index] + ':00'):
                        # 获取增加的评论(根据时间比较)
                        cmti.content = commentsinfo[index]
                        comments.append(cmti)
                # 保存获取到的评论
                if len(comments) > 0:
                    self.commentstorage.store(params.originalurl, comments)

            else:
                Logger.getlogging().error(
                    'proparam.step == {step}'.format(step=params.step))

        except Exception, e:
            traceback.print_exc()
    def process(self, params):
        Logger.getlogging().info(params.url)
        try:
            if params.step is ZolbbsComments.STEP_1:
                #Step1: 通过得到docurl,得到获取评论的首页url。
                if not self.r.search(self.FORMAT, params.originalurl):
                    Logger.getlogging().error(params.originalurl)
                    return
                value = self.r.parse(self.FORMAT, params.originalurl)[0]
                field = value[0]
                boardid = value[1]
                bookid = value[2]

                # 默认为第一页
                curpage = 1
                if len(value) > 3:
                    if value[3] == '':
                        pass
                    else:
                        curpage = int(value[3][1:])

                # 获取总页数
                totalpagestr = self.r.getid('totalPage', params.content)
                if totalpagestr == '':
                    Logger.getlogging().error('Unable to get totalPage')
                    return

                # 打开STEP1中URL,截取"count":num字段,取出num的值,num字段为评论总数
                html = etree.HTML(params.content)
                xparser = XPathUtility(params.content)
                # 论坛评论数
                comments_count = int(
                    xparser.getnumber('//*[@id="bookTitle"]/div/em[2]'))
                if comments_count == 0:
                    return
                cmtnum = CMTStorage.getcount(params.originalurl, True)
                if cmtnum >= comments_count:
                    return
                # 保存最新评论数
                NewsStorage.setcmtnum(params.originalurl, comments_count)

                # 获取当前页的评论
                params.customized['page'] = curpage
                self.geturlcomments(params)

                # 拼出其他评论页面
                totalPage = int(totalpagestr)
                # 做爬取最大页数判断
                if totalPage >= self.maxpages:
                    totalPage = self.maxpages

                start = int(cmtnum / self.PAGE_SIZE) + 1
                end = int(totalPage)
                if end > start + self.maxpages:
                    start = end - self.maxpages

                for page in range(end, start - 1, -1):
                    if page == curpage:
                        continue
                    comment_url = ZolbbsComments.COMMENT_URL.format(
                        field=field, boardid=boardid, bookid=bookid, page=page)
                    self.storeurl(comment_url, params.originalurl,
                                  ZolbbsComments.STEP_2, {'page': page})

            elif params.step == ZolbbsComments.STEP_2:
                # Step2: 得到所有评论,抽取评论
                self.geturlcomments(params)
            else:
                Logger.getlogging().error(
                    'proparam.step == {step}'.format(step=params.step))

        except Exception, e:
            traceback.print_exc()