def step3(self, params):
     """Store every comment, including nested replies, from a JSON response.

     ``params.content`` is parsed as JSON.  Each entry of its ``comments``
     list, and every reply found under an entry's own ``comments`` key at
     any depth, is handed to ``CMTStorage.storecmt`` together with its
     content, its ``create_time`` passed through
     ``TimeUtility.getuniformtime`` and the author's nickname
     (``'anonymous'`` when the passport carries no nickname).

     Args:
         params: request context; ``content`` holds the JSON text and
             ``originalurl`` is the url the comments are stored under.

     Returns:
         None.  Any exception is logged, together with
         ``constant.ERRORCODE_SITE_NOGET_SITE`` for ``params.originalurl``,
         and is not re-raised.
     """
     try:
         jsondata = json.loads(params.content)
         # An explicit stack visits the replies of *every* comment.  The
         # previous nested loops only followed the last reply of each level
         # and silently dropped the replies of its earlier siblings.
         # Items are pushed reversed so they are popped in document order.
         pending = list(reversed(jsondata['comments'] or []))
         while pending:
             comment = pending.pop()
             content = comment['content']
             curtime = TimeUtility.getuniformtime(comment['create_time'])
             nick = comment['passport'].get('nickname', 'anonymous')
             if not CMTStorage.exist(params.originalurl, content,
                                     curtime, nick):
                 CMTStorage.storecmt(params.originalurl, content,
                                     curtime, nick)
             pending.extend(reversed(comment.get('comments') or []))
     except Exception:
         Logger.printexception()
         Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
Пример #2
0
 def analysis(self, line, method):
     """Build a ProcessParam from one JSON line of crawler output.

     The line must decode to an object with ``crawler_time``, ``foundin``
     and ``html`` keys (plus ``data`` for POST requests).  When the first
     three characters of ``html`` equal ``constant.GZIP_CODE`` the payload
     is inflated with ``zlib.decompress(..., 16 + zlib.MAX_WBITS)``.  The
     content is then url-decoded and decoded with the charset that
     ``RegexUtility.getid('charset', ...)`` reports.

     Optional ``property`` entries named ``page_body``, ``page_title`` and
     ``html_time`` are copied onto the result; entries without a
     ``result`` key are ignored.

     Args:
         line: one line of crawler output (JSON text).
         method: request type; compared with ``constant.REQUEST_TYPE_POST``.

     Returns:
         The populated ProcessParam, or None when any step raises (the
         exception is logged through ``Logger.printexception``).
     """
     try:
         js = json.loads(line)
         param = ProcessParam()
         param.crawler_time = TimeUtility.getuniformtime(js['crawler_time'])
         param.url = Common.urldec(js['foundin'])
         param.content = js['html']
         if method == constant.REQUEST_TYPE_POST:
             param.data = js['data']
         if js['html'][:3] == constant.GZIP_CODE:
             param.content = zlib.decompress(param.content,
                                             16 + zlib.MAX_WBITS)
         # decode
         content = Common.urldec(param.content)
         charset = RegexUtility.getid('charset', content)
         param.content = Common.trydecode(content, charset)
         if 'property' in js:
             # "prop" rather than "property": do not shadow the builtin.
             for prop in js['property']:
                 if 'result' not in prop:
                     continue
                 name = prop['property_name']
                 if name == u'page_body':
                     param.page_body = Common.trydecode(
                         Common.urldec(prop['result'][0]['text']),
                         constant.CHARSET_GBK)
                 elif name == u'page_title':
                     param.page_title = Common.trydecode(
                         Common.urldec(prop['result'][0]['text']),
                         constant.CHARSET_GBK)
                 elif name == u'html_time':
                     param.html_time = TimeUtility.getuniformtime(
                         prop['result'][0]['text'])
         return param
     except Exception:
         # Best effort: a malformed line is logged and yields None.
         Logger.printexception()
         return None
    def seturlinfos(params):
        """Insert or update the news-table row that belongs to ``params.url``.

        When ``NewsStorage.exist`` reports the url as already stored, the
        title, body (skipped for video pages), counters and update date are
        rewritten; the publish date is only written when the stored one
        still equals ``TimeUtility.getintformtime(0)``.  Otherwise a
        complete new row is inserted.

        Args:
            params: object carrying ``url``, ``type``, ``title``, ``body``,
                ``pubtime``, the four counters and, for inserts, ``query``,
                ``channel`` and ``createtime``.
        """
        news_id = NewsStorage.getid(params.url)

        if not NewsStorage.exist(params.url):
            # Unknown url: assemble the complete row and insert it.
            row = {
                SQLDAO.SPIDER_TABLE_NEWS_TYPE: params.type,
            }
            row[SQLDAO.SPIDER_TABLE_NEWS_TITLE] = Common.strfilter(
                params.title)
            if params.type != constant.SPIDER_S2_WEBSITE_VIDEO:
                row[SQLDAO.SPIDER_TABLE_NEWS_BODY] = Common.strfilter(
                    params.body)
            row[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE] = (
                TimeUtility.getuniformtime(params.pubtime))
            row[SQLDAO.SPIDER_TABLE_NEWS_CMTNUM] = params.cmtnum
            row[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM] = params.clicknum
            row[SQLDAO.SPIDER_TABLE_NEWS_FANSNUM] = params.fansnum
            row[SQLDAO.SPIDER_TABLE_NEWS_VOTENUM] = params.votenum
            row[SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE] = SQLDAO.gettime()
            row[SQLDAO.SPIDER_TABLE_NEWS_ID] = news_id
            row[SQLDAO.SPIDER_TABLE_NEWS_URL] = params.url
            row[SQLDAO.SPIDER_TABLE_NEWS_QUERY] = params.query
            row[SQLDAO.SPIDER_TABLE_NEWS_CHANNEL] = params.channel
            row[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE] = params.createtime
            row[SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG] = (
                NewsStorage.LOCALMACHINEFLAG)
            SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                        SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                        SQLDAO.getvaluesfromkeys(row))
            return

        # Known url: refresh the mutable columns only (the type is not
        # rewritten on update).
        stored = NewsStorage.getdoc(params.url)
        changes = {}
        changes[SQLDAO.SPIDER_TABLE_NEWS_TITLE] = Common.strfilter(
            params.title)
        if params.type != constant.SPIDER_S2_WEBSITE_VIDEO:
            changes[SQLDAO.SPIDER_TABLE_NEWS_BODY] = Common.strfilter(
                params.body)
        stored_date = stored.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
                                 TimeUtility.getintformtime(0))
        if stored_date == TimeUtility.getintformtime(0):
            # Only fill in a publish date that has never been set.
            changes[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE] = (
                TimeUtility.getuniformtime(params.pubtime))
        changes[SQLDAO.SPIDER_TABLE_NEWS_CMTNUM] = params.cmtnum
        changes[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM] = params.clicknum
        changes[SQLDAO.SPIDER_TABLE_NEWS_FANSNUM] = params.fansnum
        changes[SQLDAO.SPIDER_TABLE_NEWS_VOTENUM] = params.votenum
        changes[SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE] = SQLDAO.gettime()
        SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                    {SQLDAO.SPIDER_TABLE_NEWS_ID: news_id},
                                    changes)
 def storecmt(url, content, pubdate, user):
     """Insert one comment row unless an identical one is already stored.

     ``content`` and ``user`` are passed through ``Common.strfilter`` and
     ``pubdate`` through ``TimeUtility.getuniformtime`` before the
     existence check, the id computation and the insert.

     Args:
         url: url the comment belongs to.
         content: comment text.
         pubdate: publication time of the comment.
         user: author name.
     """
     content = Common.strfilter(content)
     user = Common.strfilter(user)
     pubdate = TimeUtility.getuniformtime(pubdate)

     if CMTStorage.exist(url, content, pubdate, user):
         return

     log = Logger.getlogging()
     log.debug(
         'url:{url}, content:{content}, pubdate:{pubdate}, user:{user}'.
         format(url=url, content=content, pubdate=pubdate, user=user))

     row = {}
     row[SQLDAO.SPIDER_TABLE_COMMENTS_ID] = CMTStorage.getid(
         url, content, pubdate, user)
     row[SQLDAO.SPIDER_TABLE_COMMENTS_URL] = url
     row[SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE] = pubdate
     row[SQLDAO.SPIDER_TABLE_COMMENTS_USER] = user
     row[SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT] = content
     row[SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE] = (
         SpiderConfigure.getinstance().starttime())

     dao = SQLDAO.getinstance()
     dao.insert(
         SQLDAO.SPIDER_TABLE_COMMENTS,
         SQLDAO.SPIDER_TABLE_COMMENTS_KEYS,
         SQLDAO.getvaluesfromkeys(row, SQLDAO.SPIDER_TABLE_COMMENTS_KEYS))
 def getid(url, content, pubdate, user):
     """Return the id of a comment, derived from its four identifying fields.

     The fields are normalised the same way ``storecmt`` normalises them
     (``Common.strfilter`` / ``TimeUtility.getuniformtime``), then the
     url-encoded url, url-encoded content, the publish date and the
     url-encoded user are concatenated and given to ``Common.md5``.
     """
     clean_content = Common.strfilter(content)
     clean_user = Common.strfilter(user)
     uniform_date = TimeUtility.getuniformtime(pubdate)
     fingerprint = Common.urlenc(url) + Common.urlenc(clean_content)
     fingerprint = fingerprint + uniform_date + Common.urlenc(clean_user)
     return Common.md5(fingerprint)
 def bbs_step3(self, params):
     try:
         xparser = XPathUtility(params.content)
         page = params.customized['page']
         pagecount = params.customized['pagecount']
         comments = []
         updatetimes = []
         nicks = []
         contents = xparser.getcomments('//*[@class="read"]')
         mid_times = xparser.getlist('//td[@class="authorname"]')
         for times in mid_times:
             updatetimes.append(self.r.parse(ur'于(\d+-\d+-\d+ \d+:\d+:\d+)留言', times)[0])
             nicks.append(self.r.parse(ur'(.*)于', times)[0])
         if page == 0:
             mid_index = 1
         elif page > 0:
             mid_index = 0
         comments_number = xparser.getnumber('//*[@id="msgsubject"]/font')
         if comments_number != 0:
             for index in range(mid_index, len(contents), 1):
                 curtime = TimeUtility.getuniformtime(updatetimes[index])
                 content = contents[index]
                 nick = nicks[index].split('于')[0].split('☆')[-1]
                 if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                     CMTStorage.storecmt(params.originalurl, content, curtime, nick)
     except Exception, e:
         traceback.print_exc()
Пример #7
0
    def get_url_id(self, params):
        """Queue the comment-id lookup for a v.qq.com page and set its date.

        Only applies to Tencent Video urls.  A ``/x/cover/<id>.html`` url
        carries a ``cid`` (series / collection / movie); a
        ``/x/cover/<x>/<id>.html`` or ``/x/page/<id>.html`` url carries a
        ``vid`` (single episode).  The first pattern that matches
        ``params.originalurl`` decides which lookup url is stored with
        ``STEP_COMMENT_FIRST_PAGE``.

        Afterwards the publish date is read from the page (regex first,
        xpath as fallback), stored when non-empty, and ``setclick`` is
        called.
        """
        cid_lookup = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&cid={cid}'
        vid_lookup = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&vid={vid}'
        # (url pattern, lookup template, template field) - tried in order,
        # first match wins.
        routes = (
            ('https?://v\.qq\.com/x/cover/(\w+).html', cid_lookup, 'cid'),
            ('https?://v\.qq\.com/x/cover/\w+/(\w+).html', vid_lookup, 'vid'),
            ('https?://v\.qq\.com/x/page/(\w+)\.html', vid_lookup, 'vid'),
        )
        for pattern, template, field in routes:
            if not self.r.search(pattern, params.originalurl):
                continue
            media_id = self.r.parse(pattern, params.originalurl)[0]
            lookup_url = template.format(**{field: media_id})
            self.storeurl(lookup_url, params.originalurl,
                          self.STEP_COMMENT_FIRST_PAGE)
            break

        # publish_date
        publish_date = self.r.getid('publish_date', params.content, split=':')
        if not publish_date:
            tag_text = XPathUtility(params.content).getstring(
                '//*[@class="video_tags"]/span|//*[@class="date"]|//*[@class="tag_item"]'
            )
            publish_date = TimeUtility.getuniformtime(tag_text)
        if publish_date:
            NewsStorage.setpublishdate(params.originalurl, publish_date)
        self.setclick(params)
Пример #8
0
    def step2(self, params):
        """Collect the video urls of one search-result page and store them.

        Every ``.s_r_txt`` result is kept when its date (text of the last
        ``<p>``) passes ``TimeUtility.compareNow`` against
        ``self.querylastdays`` and its title passes ``Common.checktitle``
        against the query in ``params.customized['info']``.  Rejected
        results are logged with the matching warning code.  A result whose
        checks raise is kept rather than dropped.

        The accepted urls are handed to ``__storeurllist__`` once, after
        the whole page has been scanned.
        """
        info = Common.urldec(params.customized['info'])
        soup = BeautifulSoup(params.content, 'html5lib')
        urllist = []

        for item in soup.select('.s_r_txt'):
            link = item.select_one('h3 > a')
            title = link.get_text()
            url = link.get('href')
            curtime = item.select('p')[-1].get_text().strip()
            try:
                if not TimeUtility.compareNow(
                        TimeUtility.getuniformtime(curtime),
                        self.querylastdays):
                    Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
                elif Common.checktitle(info, title):
                    urllist.append(url)
                else:
                    Logger.log(url,
                               constant.ERRORCODE_WARNNING_NOMATCHTITLE)
            except Exception:
                # Deliberate best effort: when the date or title cannot be
                # evaluated, keep the url instead of losing it.
                urllist.append(url)

        # Store once per page.  The call used to sit inside the loop and
        # re-submitted the growing list on every iteration.
        if urllist:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
    def getclick(self, params):
        """Queue the play-count request and fill in a missing publish date.

        When ``params.originalurl`` is a le.com video page, the numeric
        video id is taken from the url and the play-count url is stored
        with ``LeComments.STEP_PALY``.

        When the stored publish date still equals
        ``TimeUtility.getintformtime(0)`` it is filled in: sports-channel
        pages use the text of the ``live-vedio-infor`` element, all other
        pages use the first 8-digit run found in the video title.
        """
        pattern = r'https?://\w+\.le\.com.*/\w+/(\d+)\.html'
        if re.search(pattern, params.originalurl) and \
                self.r.search(pattern, params.originalurl):
            vid = self.r.parse(pattern, params.originalurl)[0]
            playcount_url = self.PALYCOUNT_URL.format(vid=vid)
            self.storeurl(playcount_url, params.originalurl,
                          LeComments.STEP_PALY)

        if NewsStorage.getpublishdate(
                params.originalurl) != TimeUtility.getintformtime(0):
            # A publish date is already known; nothing to fill in.
            return

        if self.r.search(r'https?://sports\.le\.com/video/\d+\.html',
                         params.originalurl):
            # Sports channel only: the date is shown on the page.
            pubTime = XPathUtility(
                params.content).getstring('//*[@class="live-vedio-infor"]')
            # Fixed: this used to read the still-unassigned name
            # "publishdate" instead of the value scraped into pubTime.
            publishdate = TimeUtility.getuniformtime(pubTime)
            NewsStorage.setpublishdate(params.originalurl, publishdate)
        else:
            # Variety channel only: the date is embedded in the title.
            title = XPathUtility(params.content).getstring(
                '//h1[@class="j-video-name video-name"]')
            if title:
                found = re.search(r'\d{8}', title)
                if found:
                    NewsStorage.setpublishdate(params.originalurl,
                                               found.group(0))
Пример #10
0
 def step3bbs(self, params):
     """Store the comments listed under result/mainreplys/rows.

     ``params.content`` is parsed as JSON.  When parsing fails or the
     ``rows`` path is missing, a warning with error code 40000 is logged
     and the method returns.  Otherwise each row's reply text, post time
     and user name are stored unless already present.
     """
     Logger.getlogging().info("JoyComments.STEP_3")
     # Step 3: the url queued by step 2 returned the comments; extract them.
     try:
         commentsinfo = json.loads(params.content)
         rows = commentsinfo['result']['mainreplys']['rows']
     except:
         Logger.getlogging().warning(
             '{url} Errorcode:40000'.format(url=params.originalurl))
         Logger.printexception()
         return

     for row in rows:
         entry = row['reply']
         content = entry['reply']['body']['text']
         curtime = TimeUtility.getuniformtime(
             str(entry['reply']['post_time']))
         nick = entry['user']['name']
         already_stored = CMTStorage.exist(params.originalurl, content,
                                           curtime, nick)
         if not already_stored:
             CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    def process(self, proparam):
        """Drive the two crawl steps for jiemian.com article comments.

        STEP_1: read the article id from ``proparam.url`` and the comment
        count from the page, record the count, and queue one comment-page
        url per page that is still missing (capped at ``self.maxpages``).

        STEP_3: record the vote count and store every comment found in the
        escaped HTML fragment of the response.

        Any other step is logged as an error.  Every exception is printed
        with ``traceback.print_exc`` and swallowed.

        Args:
            proparam: request context; reads ``step``, ``url``,
                ``originalurl`` and ``content``.
        """
        Logger.getlogging().info(proparam.url)
        try:
            # "==" rather than "is": step identifiers are compared by
            # value, exactly like the STEP_3 branch below.
            if proparam.step == jiemianComments.STEP_1:
                # Article id from the url.
                articleId = re.findall(r'^http://www\.jiemian\.com/\w+/(\d+)', proparam.url)[0]
                # Click count.
                self.setclick(proparam)
                # Number of comments announced by the page.
                comments_count = float(re.findall(r'"comment_count">(\d+)</span>', proparam.content)[0])
                if not comments_count:
                    return
                NewsStorage.setcmtnum(proparam.originalurl, comments_count)

                # Incremental crawl: only fetch what is not stored yet.
                cmtnum = CMTStorage.getcount(proparam.originalurl, True)
                if cmtnum >= comments_count:
                    return
                page_num = int(math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
                if page_num >= self.maxpages:
                    page_num = self.maxpages
                # Queue every comment page.
                for page in range(1, page_num + 1):
                    url = jiemianComments.COMMENTS_URL % (articleId, page)
                    self.storeurl(url, proparam.originalurl, jiemianComments.STEP_3)
            elif proparam.step == jiemianComments.STEP_3:
                # Vote ("ding") count.
                votenum = self.r.getid('ding', proparam.content)
                if votenum == '':
                    Logger.getlogging().debug("Unable to get votenum")
                else:
                    NewsStorage.setvotenum(proparam.originalurl, votenum)
                # The payload is HTML embedded in an escaped string, hence
                # the escaped slashes and quotes in the patterns.
                comments = re.findall(r'<p>(.+?)<\\/p>', proparam.content)
                ctime = re.findall(r'<span class=\\"date\\">(.+?)<\\/span>',proparam.content)
                nicks = re.findall(r'class=\\"author-name\\">(.+?)<\\/a>', proparam.content)

                # The three lists are paired by position.
                for raw_content, raw_time, raw_nick in zip(comments, ctime, nicks):
                    curtime = TimeUtility.getuniformtime(raw_time.replace('\\', ''))
                    # SECURITY(review): eval() runs on text scraped from a
                    # remote page purely to decode \uXXXX escapes.  This is
                    # untrusted input; replace with a non-executing decoder.
                    content = eval('u"' + raw_content + '"').encode('utf-8')
                    nick = eval('u"' + raw_nick + '"').encode('utf-8')
                    if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
                        CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
            else:
                Logger.getlogging().error("proparam.step == %d", proparam.step)

        except Exception:
            traceback.print_exc()
 def geturlcomments(self, proparam):
     """Store the comments found in a JSON-like comment response.

     Comment text, creation time and nickname are pulled out of
     ``proparam.content`` with three regular expressions and paired by
     position.  Each comment is stored unless ``CMTStorage.exist`` already
     knows it.

     Raises:
         IndexError: when fewer times or nicknames than comment bodies
             were matched.
     """
     # Regular expressions for the three fields.
     comments = re.findall(r'content":"(.+?)","paragraph_id"',
                           proparam.content)
     commentsTime = self.r.parse(
         r'origin_created":"(\d+)","member_avatarPath"', proparam.content)
     nicks = self.r.parse(r'"nickname":"(.*?)","is_hot"', proparam.content)

     for index, raw_comment in enumerate(comments):
         # SECURITY(review): eval() runs on text scraped from a remote
         # response purely to decode \uXXXX escapes.  This is untrusted
         # input; replace with a non-executing decoder.
         content = eval('u"' + raw_comment + '"').encode('utf-8')
         curtime = TimeUtility.getuniformtime(commentsTime[index])
         nick = eval('u"' + nicks[index] + '"').encode('utf-8')
         if not CMTStorage.exist(proparam.originalurl, content, curtime,
                                 nick):
             CMTStorage.storecmt(proparam.originalurl, content, curtime,
                                 nick)
    def geturlcomments(self, params):
        """Store the comments of one forum page.

        On page 1 the first ``<table>`` is excluded from the time and
        content queries; on every other page all tables are read.  Times,
        bodies and nicknames are paired by position, and the first four
        characters of each time string are dropped before it is parsed.
        """
        xparser = XPathUtility(params.content)
        # Pick the xpath pair for this page.
        if params.customized['page'] == 1:
            time_xpath = '//table[position()>1]/tbody/tr/td/span[1]'
            body_xpath = '//table[position()>1]/tbody/tr[2]/td[@class="post-main"]'
        else:
            time_xpath = '//table/tbody/tr/td/span[1]'
            body_xpath = '//table/tbody/tr[2]/td[@class="post-main"]'

        commentstimes = xparser.getcomments(time_xpath)
        commentscontents = xparser.getcomments(body_xpath)
        commentsnicks = xparser.getcomments('//*[@class="name"]/a')

        # Store whatever is not known yet.
        for position, raw_body in enumerate(commentscontents):
            curtime = TimeUtility.getuniformtime(commentstimes[position][4:])
            content = raw_body.strip()
            nick = commentsnicks[position].strip()
            if CMTStorage.exist(params.originalurl, content, curtime, nick):
                continue
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
Пример #14
0
    def step3news(self, params):
        """Store the comments of one news comment page.

        Bodies, publish times and user names are read from the
        ``comment-list-new`` container and paired by position; the number
        of time entries decides how many comments are processed.
        """
        Logger.getlogging().info("ZolbbsComments.STEP_3")
        # Step 3: the url queued by step 2 returned the comments; extract them.
        xparser = XPathUtility(params.content)
        list_root = '//*[@class="comment-list-new"]'
        commentsinfo = xparser.getcomments(
            '//*[@class="comment-list-new"]//*[@class="commli"]/p')
        commentstime = xparser.getcomments(
            '//*[@class="comment-list-new"]//*[@class="published-time"]')
        commentsnick = xparser.getcomments(
            '//*[@class="comment-list-new"]//*[@class="user-name"]')

        for position, raw_time in enumerate(commentstime):
            stamp = raw_time.strip()
            try:
                # Preferred: re-parse the uniform time with an explicit format.
                curtime = TimeUtility.getuniformtime(getuniformtime(stamp),
                                                     u'%Y-%m-%d %H:%M')
            except Exception:
                # Fallback: use the uniform time as returned.
                curtime = getuniformtime(stamp)

            content = commentsinfo[position]
            nick = commentsnick[position]
            if CMTStorage.exist(params.originalurl, content, curtime, nick):
                continue
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
Пример #15
0
    def step3_ebook(self, params):
        """Store the e-book comments contained in a JSON response.

        The response must carry the comment list as an HTML fragment under
        ``data.listHtml``.  Inside every ``div.cf`` the first ``dd > p``
        holds ``nick|...|time`` and the second holds the comment text.

        Returns:
            None.  The method returns early when ``data``, the HTML
            fragment or the ``div.cf`` elements are missing.  Any
            exception is logged through ``Logger.printexception``.
        """
        try:
            jsoncontent = json.loads(params.content)
            if 'data' not in jsoncontent:
                return
            html = jsoncontent['data']['listHtml']
            if not html:
                return
            divs = BeautifulSoup(html, 'lxml').select('div.cf')
            if not divs:
                return
            for div in divs:
                # commentList > dl:nth-child(1) > div.cf > dd > p:nth-child(2)
                # Query once per item instead of three times.
                paragraphs = div.select('dd > p')
                content = paragraphs[1].get_text()
                header = paragraphs[0].get_text().split('|')
                curtime = TimeUtility.getuniformtime(header[-1])
                nick = header[0]

                if not CMTStorage.exist(params.originalurl, content, curtime,
                                        nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime,
                                        nick)

        except Exception:
            Logger.printexception()
Пример #16
0
 def analysis(self, line, method):
     """Turn one input line into a ProcessParam.

     The line is first treated as a JSON record of crawler output
     (``crawler_time``, ``foundin``, ``html`` and, for POST, ``data``).
     When that fails for any reason the line is treated as a plain url
     (or, for POST, a JSON object with ``url`` and ``data``) whose content
     is fetched through ``HttpCacher.getcontent``.

     Returns:
         The populated ProcessParam, or None for blank lines, lines that
         start with ``#`` and urls for which no content is available.
     """
     try:
         record = json.loads(line)
         param = ProcessParam()
         param.crawler_time = TimeUtility.getuniformtime(
             record['crawler_time'])
         param.url = Common.urldec(record['foundin'])
         param.content = record['html']
         if method == constant.REQUEST_TYPE_POST:
             param.data = record['data']
         if record['html'][:3] == constant.GZIP_CODE:
             param.content = zlib.decompress(param.content,
                                             16 + zlib.MAX_WBITS)
         # decode
         decoded = Common.urldec(param.content)
         param.content = Common.trydecode(
             decoded, RegexUtility.getid('charset', decoded))
         return param
     except:
         # Not a crawler record: fall back to "the line is the request".
         line = line.replace('\n', '').strip()
         if not line or line[0] == '#':
             return None
         Logger.getlogging().debug(line)
         fallback = ProcessParam()
         fallback.url = line
         if method == constant.REQUEST_TYPE_POST:
             request = json.loads(line)
             fallback.url = request['url']
             fallback.data = request['data']
         fallback.content = HttpCacher.getcontent(line, method)
         if fallback.content is None:
             return None
         return fallback
Пример #17
0
    def step2(self, params):
        """Store the matching video urls of one JSON search-result page.

        The response's ``html`` field is parsed and every ``li.video``
        entry is checked: its publish time must pass ``self.compareNow``
        and its title must pass ``self.checktitle`` against the decoded
        keyword.  Accepted urls are stored one by one; rejected ones are
        logged with the matching warning code.  An entry that raises is
        logged and skipped.
        """
        query = Common.urldec(params.customized['keyword'])
        jsondata = json.loads(params.content)
        soup = bs(jsondata['html'], 'html5lib')

        for video in soup.select('li.video'):
            try:
                anchor = video.select_one('a')
                # Absolute url without the query string, with a trailing slash.
                videoUrl = ('https:' + anchor.get('href')).split('?')[0] + '/'
                title = anchor.get('title')
                time_node = video.find(attrs={'class': 'so-icon time'})
                pubtime = time_node.get_text().strip()

                if not self.compareNow(TimeUtility.getuniformtime(pubtime)):
                    Logger.log(videoUrl,
                               constant.ERRORCODE_WARNNING_NOMATCHTIME)
                    continue
                if not self.checktitle(query, title):
                    Logger.log(videoUrl,
                               constant.ERRORCODE_WARNNING_NOMATCHTITLE)
                    continue
                self.__storeurl__(videoUrl, pubtime, SPIDER_S2_WEBSITE_VIDEO)
            except:
                Logger.printexception()
Пример #18
0
 def dmzjnews_step3(self, params):
     """Store the comments of a response that wraps a JSON array.

     ``params.content`` is trimmed in place to the text between the first
     ``[`` and the last ``]`` and parsed as JSON.  Every entry's content
     and creation time are stored with an empty user name.

     Raises:
         ValueError: when the content holds no ``[`` or ``]``.
     """
     body = params.content
     params.content = body[body.index('['):body.rindex(']') + 1]
     for entry in json.loads(params.content):
         content = entry['content']
         curtime = TimeUtility.getuniformtime(entry['create_time'])
         CMTStorage.storecmt(params.originalurl, content, curtime, '')
Пример #19
0
    def process_book(self, params):
        """Drive the three crawl steps for 17k.com book comments.

        STEP_1 queues the first comment page for the book id found in the
        url.  STEP_2 reads the total comment and page counts from that page
        and queues every comment page (capped at ``self.maxpages``) unless
        nothing new exists.  STEP_3 stores the comments of one page.

        Every exception is printed with ``traceback.print_exc`` and
        swallowed.
        """
        try:
            if params.step == Comments.STEP_1:
                # The book id is the only url part the comment api needs.
                bookId = self.r.parse('^http://www\.17k\.com/book/(\w+).html$',
                                      params.originalurl)[0]
                first_page = Comments.COMMENTS_URL % (bookId, 1,
                                                      Comments.PAGE_SIZE)
                # Ask the download platform for the first comment page.
                self.storeurl(first_page, params.originalurl,
                              Comments.STEP_2, {'bookId': bookId})
                return

            if params.step == Comments.STEP_2:
                # First comment page arrived: queue all comment pages.
                bookId = params.customized['bookId']
                paging = json.loads(params.content)

                comments_count = int(paging['page']['count'])
                # Incremental crawl: stop when nothing new was posted.
                if CMTStorage.getcount(params.originalurl) >= comments_count:
                    return
                NewsStorage.setcmtnum(params.originalurl, comments_count)
                # Last stored publish time (fetched, not used further here).
                lasttime = CMTStorage.getlastpublish(params.originalurl, True)
                page_count = int(paging['page']['pagecount'])
                if page_count == 0:
                    return
                if page_count >= self.maxpages:
                    page_count = self.maxpages

                for page in range(1, page_count + 1):
                    self.storeurl(
                        Comments.COMMENTS_URL % (bookId, page,
                                                 Comments.PAGE_SIZE),
                        params.originalurl, Comments.STEP_3,
                        {'bookId': bookId})
                return

            if params.step == Comments.STEP_3:
                # One comment page arrived: store its entries.
                for comment in json.loads(params.content)['page']['result']:
                    curtime = TimeUtility.getuniformtime(
                        comment['creationDate'])
                    content = comment['summary']
                    nick = comment['marks']['nikeName']
                    if CMTStorage.exist(params.originalurl, content,
                                        curtime, nick):
                        continue
                    CMTStorage.storecmt(params.originalurl, content,
                                        curtime, nick)

        except Exception:
            traceback.print_exc()
Пример #20
0
    def step3(self, params):
        """Store the comments listed under data/items of a JSON response.

        For every item the content, the ``created_at`` time (parsed with
        the ``%Y-%m-%d %H:%M:%S`` format and normalised again) and the
        user's name are stored unless already present.  Any exception is
        logged through ``Logger.printexception``.
        """
        try:
            Logger.getlogging().info("Kr36Comments.STEP_3")
            # Step 3: the url queued by step 2 returned the comments.
            jsoncontent = json.loads(params.content)
            lasttime = params.customized  # read but not used in this step
            for item in jsoncontent['data']['items']:
                content = item['content']
                parsed = TimeUtility.getuniformtime(item['created_at'],
                                                    u'%Y-%m-%d %H:%M:%S')
                curtime = TimeUtility.getuniformtime(parsed)
                nick = item['user']['name']

                if CMTStorage.exist(params.originalurl, content, curtime,
                                    nick):
                    continue
                CMTStorage.storecmt(params.originalurl, content, curtime,
                                    nick)
        except:
            Logger.printexception()
Пример #21
0
 def process(self, params):
     """Drive the two steps of the S2 search query.

     STEP_1: unless the page shows an ``attentionTitle`` notice, read the
     number of result pages from the pager (defaulting to 1, capped at
     ``self.pagelimit``) and queue one search url per page for STEP_2.

     STEP_2: keep the result urls whose publish time is later than
     ``self.querylastdays`` days ago and store them.

     Args:
         params: request context; reads ``step``, ``content`` and
             ``customized['key']``.
     """
     if params.step == S2Query.STEP_1:
         html = etree.HTML(params.content)
         # The page counts as blank when the notice element has text.
         try:
             is_blank = html.xpath(
                 '//div[@class="attentionTitle"]/text()') != []
         except Exception:
             is_blank = False
         if is_blank:
             return

         # Second-to-last pager link carries the highest page number.
         try:
             page = html.xpath(
                 '//*[@class="pcgames_page"]/a/text()')[-2]
         except Exception:
             page = '1'
         if self.pagelimit and int(page) > self.pagelimit:
             Logger.getlogging().warning(
                 'the pageMaxNumber is shutdown to {0}'.format(
                     self.pagelimit))
             page = self.pagelimit

         urllist = []
         for number in range(1, int(page) + 1):
             urllist.append(S2Query.S2_URL % (params.customized['key'],
                                              str(number)))
         self.__storeqeuryurllist__(urllist, S2Query.STEP_2, {
             'key': params.customized['key'],
             'page': page
         })
     elif params.step == S2Query.STEP_2:
         html = etree.HTML(params.content)
         # Cut-off time: now minus querylastdays.  A previous version was
         # meant to read the last update time from storage instead.
         cutoff = (datetime.datetime.now() + datetime.timedelta(
             days=-int(self.querylastdays))).strftime('%Y-%m-%d %H:%M:%S')
         urllist = html.xpath(
             '//div[@class="resultList"]/ul/li/p/a[1]/@href')
         pbtime = html.xpath(
             '//*[@class="resultList"]/ul/li/span[1]/text()')
         recent = []
         for index, raw_time in enumerate(pbtime):
             published = TimeUtility.getuniformtime(
                 raw_time.replace('-', '').strip(),
                 '%a %b %d %H:%M:%S CST %Y')
             # NOTE(review): string comparison - assumes getuniformtime
             # returns '%Y-%m-%d %H:%M:%S' text; confirm.
             if published > cutoff:
                 recent.append(urllist[index])
         if recent:
             self.__storeurllist__(recent, SPIDER_S2_WEBSITE_TIEBA)
    def geturlcomments(self, params, startpos=0):
        """Collect the new comments of one forum page and store them.

        Two page layouts are handled.  When ``.info`` elements exist they
        hold the comment bodies and the ``.date`` elements the times;
        entries before ``startpos`` are skipped.  Otherwise each ``td.plc``
        cell is searched for an ``authorposton`` time and a
        ``postmessage_`` body.  A comment is kept only when
        ``URLStorage.storeupdatetime`` accepts its time.

        Args:
            params: request context; reads ``content`` and ``originalurl``.
            startpos: index of the first ``.info`` entry to process.
        """
        soup = BeautifulSoup(params.content, 'html5lib')
        bodies = soup.select('.info')
        dates = soup.select('.date')
        collected = []

        if len(bodies) > 0:
            # Layout 1: parallel ".info" / ".date" lists.
            for position in range(startpos, len(bodies)):
                cmti = CommentInfo()
                publicTime = getuniformtime(
                    dates[position].get_text()).strip()
                tm = TimeUtility.getuniformtime(
                    TimeUtility.getuniformtime(publicTime, u'%Y-%m-%d %H:%M'))
                if URLStorage.storeupdatetime(params.originalurl, tm):
                    cmti.content = bodies[position].get_text()
                    collected.append(cmti)
        else:
            # Layout 2: one "td.plc" cell per post.
            # //*[contains(@id,"postmessage_")]
            cells = soup.select('td.plc')
            if cells is None:
                return
            for cell in cells:
                time_node = cell.find(attrs={'id': re.compile('authorposton')})
                if not time_node:
                    continue
                posted = getuniformtime(time_node.get_text())
                if not URLStorage.storeupdatetime(params.originalurl, posted):
                    continue
                body_node = cell.find(
                    attrs={'id': re.compile('postmessage_')})
                if body_node:
                    cmti = CommentInfo()
                    cmti.content = body_node.get_text()
                    collected.append(cmti)

        if len(collected) > 0:
            self.commentstorage.store(params.originalurl, collected)
Пример #23
0
 def getcomments(self, params):
     """Store each comment of the JSON response unless it already exists.

     ``params.content`` is parsed as JSON. Every entry of its ``data`` list
     supplies ``ctime``, ``content`` and ``user.username``; the entry is
     written with ``CMTStorage.storecmt`` when ``CMTStorage.exist`` is falsy.
     """
     payload = json.loads(params.content)
     for entry in payload['data']:
         posted_at = TimeUtility.getuniformtime(entry['ctime'])
         text = entry['content']
         author = entry['user']['username']
         record = (params.originalurl, text, posted_at, author)
         if CMTStorage.exist(*record):
             continue
         CMTStorage.storecmt(*record)
Пример #24
0
 def step_click(self, params):
     """Record play count and publish date for the entry matching ``sid``.

     ``params.content`` is a JSON list. The first entry whose ``id`` equals
     ``str(params.customized['sid'])`` provides ``adddate`` and
     ``playtimes``; later entries are not examined. Nothing is stored when
     no entry matches.
     """
     wanted_id = str(params.customized['sid'])
     for record in json.loads(params.content):
         if record['id'] != wanted_id:
             continue
         published = TimeUtility.getuniformtime(record['adddate'])
         plays = self.str2num(record['playtimes'])
         NewsStorage.setclicknum(params.originalurl, plays)
         NewsStorage.setpublishdate(params.originalurl, published)
         # Only the first matching entry is used.
         break
Пример #25
0
    def step3(self, params):
        """Step 3: extract the comments from the JSON fetched in step 2.

        Each element of ``data.items`` yields one ``CommentInfo`` whose
        ``content`` is the item's ``content``. The item's ``created_at`` is
        normalised with ``TimeUtility.getuniformtime`` and the comment is kept
        only when ``URLStorage.storeupdatetime`` returns a truthy value.
        Kept comments are handed to ``self.commentstorage.store``.
        """
        Logger.getlogging().info("ThirtysixKryptonComments.STEP_3")
        payload = json.loads(params.content)
        fresh = []

        for item in payload['data']['items']:
            entry = CommentInfo()
            entry.content = item['content']
            created = item['created_at']
            stamp = TimeUtility.getuniformtime(
                TimeUtility.getuniformtime(created, u'%Y-%m-%d %H:%M:%S'))
            if URLStorage.storeupdatetime(params.originalurl, stamp):
                fresh.append(entry)

        # Persist only when at least one comment was kept.
        if fresh:
            self.commentstorage.store(params.originalurl, fresh)
Пример #26
0
    def process(self, params):
        """Run the Ku6 comment-crawl step selected by ``params.step``.

        * ``None``: extract the video id from ``params.originalurl`` and
          queue the first comment page for ``Ku6Comments.STEP_2``.
        * ``STEP_2``: read ``data.count`` from the JSON response, record it
          with ``NewsStorage.setcmtnum`` and queue the comment pages for
          ``STEP_3``. Returns early when the count is 0 or when
          ``CMTStorage.getcmtnum`` is already at least the count.
        * ``STEP_3``: store every comment in ``data.list`` for which
          ``CMTStorage.exist`` is falsy.

        Any ``Exception`` is reported through ``Logger.printexception`` and
        not re-raised.
        """
        try:
            if params.step is None:
                # Raw string: the pattern carries regex escapes (\. and \w).
                oid = self.r.parse(
                    r'^http://v\.ku6\.com/show/([\w-]+..).html',
                    params.originalurl)[0]
                # URL of the first comment page.
                comments_url = Ku6Comments.COMMENTS_URL % (oid, 1, 1)
                self.storeurl(comments_url, params.originalurl,
                              Ku6Comments.STEP_2, {'oid': oid})

            elif params.step == Ku6Comments.STEP_2:
                oid = params.customized['oid']
                comments = json.loads(params.content)

                # Kept as float so the page-count division below is a true
                # division on Python 2 as well.
                comments_count = float(comments['data']['count'])
                NewsStorage.setcmtnum(params.originalurl,
                                      int(comments['data']['count']))
                if comments_count == 0:
                    return
                # Nothing to do when the stored count already covers the
                # count reported by the site.
                cmtnum = CMTStorage.getcmtnum(params.originalurl, True)
                if cmtnum >= comments_count:
                    return
                page_total = int(
                    math.ceil(comments_count / Ku6Comments.PAGE_SIZE))
                # Pages 1..page_total+1 are requested, i.e. one page past the
                # computed last page; STEP_3 returns early on an empty list.
                for page_no in range(1, page_total + 2):
                    comment_url = Ku6Comments.COMMENTS_URL % (
                        oid, Ku6Comments.PAGE_SIZE, page_no)
                    self.storeurl(comment_url, params.originalurl,
                                  Ku6Comments.STEP_3, {'oid': oid})

            elif params.step == Ku6Comments.STEP_3:
                commentsinfo = json.loads(params.content)
                comment_list = commentsinfo['data']['list']
                if not comment_list:
                    return
                for comment in comment_list:
                    curtime = TimeUtility.getuniformtime(
                        int(comment['commentCtime']))
                    content = comment['commentContent']
                    # NOTE(review): the nick is read from 'commentContent',
                    # the same key as the content. This looks like a
                    # copy-paste slip; confirm the author field name in the
                    # Ku6 response before changing it.
                    nick = comment['commentContent']
                    if not CMTStorage.exist(params.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content,
                                            curtime, nick)

        except Exception:
            Logger.printexception()
Пример #27
0
 def step2(self, params):
     """Store one page of comments and queue the next page when allowed.

     Reads ``url_id``, ``comment_id``, ``before_update`` and the running
     counter ``tempcmttotal`` (default 0) from ``params.customized``.
     The JSON in ``params.content`` supplies ``data.last``,
     ``data.hasnext``, ``data.total`` and the ``data.commentid`` list.
     If that header cannot be read, the failure is logged with
     ``ERRORCODE_SITE_NOGET_COMMNETS`` and the step ends.

     Every listed comment is stored with ``CMTStorage.storecmt``. The next
     page is queued only when ``hasnext`` is truthy and none of these
     limits is hit:

     * the oldest time on this page is earlier than ``before_update``;
     * ``tempcmttotal`` has reached ``self.comment_maxnum``;
     * ``tempcmttotal / self.page_size`` exceeds ``self.maxpages``.

     Any other ``Exception`` is reported via ``Logger.printexception``.
     """
     try:
         url_id = params.customized['url_id']
         # Not used below (the next-page cursor comes from ``last``), but a
         # missing key still aborts this step exactly as before.
         comment_id = params.customized['comment_id']
         before_update = params.customized['before_update']
         # Running number of comments seen during this spider run.
         tempcmttotal = params.customized.get('tempcmttotal', 0)
         try:
             jsondata = json.loads(params.content)
             last = jsondata['data']['last']
             hasnext = jsondata['data']['hasnext']
             cmttotal = float(jsondata['data']['total'])
             NewsStorage.setcmtnum(params.originalurl, cmttotal)
         except Exception:
             Logger.log(params.originalurl,
                        constant.ERRORCODE_SITE_NOGET_COMMNETS)
             return
         temptimes = []
         for comment in jsondata['data']['commentid']:
             tempcmttotal += 1
             content = comment['content']
             # Named ``pubtime`` so the stdlib ``time`` module is not shadowed.
             pubtime = TimeUtility.getuniformtime(comment['time'])
             temptimes.append(pubtime)
             user = comment['userinfo'].get('nick', 'anonymous')
             # Stored unconditionally; CMTStorage.exist could be used to
             # skip comments that are already present.
             CMTStorage.storecmt(params.originalurl, content, pubtime, user)
         # Decide whether further pages should be fetched.
         nextflag = True
         # Publish-time limit (QQ-specific strategy): comment times are
         # ordered and decreasing, so an entry older than before_update
         # means the remaining pages are older still.
         if temptimes and min(temptimes) < before_update:
             nextflag = False
         # Quantity limits.
         if tempcmttotal >= self.comment_maxnum:
             nextflag = False
         if float(tempcmttotal) / self.page_size > self.maxpages:
             nextflag = False
         if hasnext and nextflag:
             url = self.COMMENTS_URL.format(url_id, last, self.page_size)
             self.storeurl(
                 url, params.originalurl, self.STEP_COMMENT_NEXT_PAGE, {
                     'url_id': url_id,
                     'comment_id': last,
                     'before_update': before_update,
                     'tempcmttotal': tempcmttotal
                 })
     except Exception:
         Logger.printexception()
 def step2bbs(self, params):
     """Store every comment found in the BBS page in ``params.content``.

     Elements whose ``id`` matches ``comment-<digits>`` are treated as
     comments. For each one the text of ``.mh-comment-meta-date`` is
     normalised with ``TimeUtility.getuniformtime`` and stored with the
     text of ``.mh-comment-content`` and an empty nick.
     """
     soup = BeautifulSoup(params.content, 'html5lib')
     # Raw string so ``\d`` reaches the regex engine exactly as written.
     comment_nodes = soup.find_all(attrs={'id': re.compile(r'comment-\d+')})
     for node in comment_nodes:
         try:
             raw_time = node.select_one('.mh-comment-meta-date').get_text()
             curtime = TimeUtility.getuniformtime(raw_time)
             content = node.select_one('.mh-comment-content').get_text()
             CMTStorage.storecmt(params.originalurl, content, curtime, '')
         except Exception:
             # Best effort: a malformed entry is logged and the loop goes on.
             Logger.printexception()
Пример #29
0
 def step3(self, params):
     """Step 3: extract and store the comments of a fetched comment page.

     The first two characters and the last character of ``params.content``
     are dropped before JSON parsing (presumably a JSONP-style wrapper;
     confirm against the endpoint). Each entry of ``data.weibo`` is stored
     with ``CMTStorage.storecmt`` when ``CMTStorage.exist`` is falsy.
     """
     payload = json.loads(params.content[2:-1])
     for entry in payload['data']['weibo']:
         posted_at = TimeUtility.getuniformtime(entry['pub_time'])
         text = entry['content']
         author = str(entry['userinfo']['nickname'])
         already_stored = CMTStorage.exist(params.originalurl, text,
                                           posted_at, author)
         if already_stored:
             continue
         CMTStorage.storecmt(params.originalurl, text, posted_at, author)
    def news_step2(self, params):
        """Store the comments, and their replies, from a comment JSON page.

        ``params.content`` is parsed as JSON; each element of ``body`` gives
        ``commentbody``, ``commentauthor``, ``commentdate`` and
        ``reply_total``. When ``reply_total`` is positive, at most that many
        entries of the element's ``reply`` list are processed the same way.
        An item is written with ``CMTStorage.storecmt`` only when
        ``CMTStorage.exist`` is falsy.

        Any ``Exception`` is reported via ``Logger.printexception`` and not
        re-raised.
        """
        try:
            jsondata = json.loads(params.content)
            for comment in jsondata['body']:
                content = str(comment['commentbody'])
                nick = str(comment['commentauthor'])
                curtime = TimeUtility.getuniformtime(comment['commentdate'])
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)

                reply_total = int(comment['reply_total'])
                if reply_total <= 0:
                    continue
                # Slice rather than index up to reply_total: if the payload
                # carries fewer replies than the reported total, indexing
                # raised IndexError and aborted every remaining comment.
                for reply in comment['reply'][:reply_total]:
                    content = reply['commentbody']
                    curtime = TimeUtility.getuniformtime(reply['commentdate'])
                    nick = reply['commentauthor']
                    if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content, curtime, nick)
        except Exception:
            Logger.printexception()