def setclick(self, params):
     """Record the play count and the up-vote count found in a page.

     Looks up ``'play_count'`` and ``'up'`` in ``params.content`` through
     ``self.r.getid``.  Each value that is truthy is stored for
     ``params.originalurl``: the first via ``NewsStorage.setclicknum``, the
     second via ``NewsStorage.setvotenum``.  Falsy results are skipped.
     """
     page = params.content
     lookup = self.r.getid
     # Both lookups happen before anything is written to storage.
     plays, ups = lookup('play_count', page), lookup('up', page)
     if plays:
         NewsStorage.setclicknum(params.originalurl, plays)
     if ups:
         NewsStorage.setvotenum(params.originalurl, ups)
    def process(self, proparam):
        """Handle one downloaded page of the jiemian comment crawl.

        The work done depends on ``proparam.step``:

        * ``jiemianComments.STEP_1`` -- ``proparam.content`` is the article
          page.  The article id is taken from ``proparam.url``, click/vote
          numbers are recorded through ``self.setclick`` and the comment
          count is read from the page.  If that count exceeds what
          ``CMTStorage.getcount`` reports, one comment-list URL per missing
          page (at most ``self.maxpages``) is queued with ``STEP_3``.
        * ``jiemianComments.STEP_3`` -- ``proparam.content`` is a
          comment-list response.  The ``'ding'`` vote number is stored, then
          every (content, time, nick) triple found by the regular
          expressions is stored unless ``CMTStorage.exist`` already knows it.
        * anything else is logged as an error.

        Every exception is caught and its traceback printed; nothing is
        propagated to the caller.
        """
        Logger.getlogging().info(proparam.url)
        try:
            # Compare step constants by value; ``is`` only tests identity.
            if proparam.step == jiemianComments.STEP_1:
                # Article id from the URL (IndexError if the URL does not
                # match -- handled by the except below).
                articleId = re.findall(
                    r'^http://www\.jiemian\.com/\w+/(\d+)', proparam.url)[0]
                # Record click / vote numbers.
                self.setclick(proparam)
                # Total number of comments shown on the page.
                comments_count = int(re.findall(
                    r'"comment_count">(\d+)</span>', proparam.content)[0])
                if comments_count == 0:
                    return
                NewsStorage.setcmtnum(proparam.originalurl, comments_count)

                # Incremental check: stop when storage is already up to date.
                cmtnum = CMTStorage.getcount(proparam.originalurl, True)
                if cmtnum >= comments_count:
                    return
                # Pages needed for the missing comments; float() keeps the
                # division exact on Python 2 as well.
                page_num = int(math.ceil(
                    float(comments_count - cmtnum) / self.PAGE_SIZE))
                page_num = min(page_num, self.maxpages)
                # Queue one comment-list URL per page.
                for page in range(1, page_num + 1):
                    url = jiemianComments.COMMENTS_URL % (articleId, page)
                    self.storeurl(url, proparam.originalurl,
                                  jiemianComments.STEP_3)
            elif proparam.step == jiemianComments.STEP_3:
                # Vote ("ding") number.
                votenum = self.r.getid('ding', proparam.content)
                if votenum == '':
                    Logger.getlogging().debug("Unable to get votenum")
                else:
                    NewsStorage.setvotenum(proparam.originalurl, votenum)
                # The response carries escaped HTML, hence the literal
                # backslashes in the patterns.
                comments = re.findall(r'<p>(.+?)<\\/p>', proparam.content)
                ctime = re.findall(
                    r'<span class=\\"date\\">(.+?)<\\/span>', proparam.content)
                nicks = re.findall(
                    r'class=\\"author-name\\">(.+?)<\\/a>', proparam.content)
                if not (len(comments) == len(ctime) == len(nicks)):
                    Logger.getlogging().warning(
                        "comment field counts differ: "
                        "%d contents, %d times, %d nicks",
                        len(comments), len(ctime), len(nicks))

                # zip() stops at the shortest list, so a mismatch no longer
                # raises IndexError part-way through the page.
                for raw_content, raw_time, raw_nick in zip(
                        comments, ctime, nicks):
                    curtime = TimeUtility.getuniformtime(
                        raw_time.replace('\\', ''))
                    # NOTE(security): eval() runs on text scraped from a
                    # remote page in order to decode its escape sequences.
                    # A crafted comment containing a double quote could
                    # execute arbitrary code -- replace with a non-executing
                    # decoder once the exact input encoding is confirmed.
                    content = eval('u"' + raw_content + '"').encode('utf-8')
                    nick = eval('u"' + raw_nick + '"').encode('utf-8')
                    if not CMTStorage.exist(proparam.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(proparam.originalurl, content,
                                            curtime, nick)
            else:
                Logger.getlogging().error("proparam.step == %d", proparam.step)
        except Exception:
            traceback.print_exc()
 def setplayinfo(self, params):
     """Store the vote number taken from a JSON play-info response.

     ``params.content`` is parsed as JSON; the ``'pos'`` value of the entry
     keyed by ``params.customized['topic_id']`` is stored for
     ``params.originalurl`` via ``NewsStorage.setvotenum``.

     Best effort: on any error (missing key, invalid JSON, ...) only
     ``params.originalurl`` is logged at debug level and nothing is raised.
     """
     try:
         topic_id = params.customized['topic_id']
         jsondata = json.loads(params.content)
         votenum = jsondata[topic_id]['pos']
         NewsStorage.setvotenum(params.originalurl, votenum)
     except Exception:
         # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt
         # and SystemExit are not swallowed.
         Logger.getlogging().debug(params.originalurl)
# Example no. 4
# 0
    def process(self, params):
        """Handle one downloaded page of the Youku comment crawl.

        The work done depends on ``params.step``:

        * ``YoukuComments.STEP_1`` -- ``params.content`` is the video page.
          When a ``videoId`` is found, the first comment-list URL and the
          play-info URL are queued with ``STEP_2``.  For
          ``http://v.laifeng.com/<digits>`` URLs the ``onlineNum`` value is
          stored as the click number.
        * ``YoukuComments.STEP_2`` -- either a play-info response (click and
          vote numbers are stored) or the first comment-list response (the
          total is compared with ``CMTStorage.getcount`` and, if there are
          new comments, one URL per page up to ``self.maxpages`` is queued
          with ``STEP_3``).
        * ``YoukuComments.STEP_3`` -- a comment-list response; every comment
          not already known to ``CMTStorage.exist`` is stored.

        Every exception is caught and reported through
        ``Logger.printexception``; nothing is propagated to the caller.
        """
        try:
            # Compare step constants by value; ``is`` only tests identity.
            if params.step == YoukuComments.STEP_1:
                # Video id used to build the comment / play-info URLs.
                objectId = self.r.getid('videoId', params.content,
                                        r'\s*:\s*"')
                if objectId:
                    pTime, sign = self._timestamp_and_sign()
                    # First page of comments.
                    comments_url = YoukuComments.COMMENTS_URL % (
                        objectId, 1, YoukuComments.PAGE_SIZE, sign, pTime)
                    self.storeurl(comments_url, params.originalurl,
                                  YoukuComments.STEP_2,
                                  {'objectId': objectId})
                    # Play / vote statistics.
                    playinfo_url = YoukuComments.PLAYINFO_URL.format(
                        vid=objectId)
                    self.storeurl(playinfo_url, params.originalurl,
                                  YoukuComments.STEP_2,
                                  {'objectId': objectId})

                # Laifeng live pages: onlineNum is used as the click number.
                # Done after the URLs are queued so that a failure to parse
                # the number does not drop the follow-up requests.
                if self.r.search(r'^http://v\.laifeng\.com/\d+',
                                 params.originalurl):
                    clicknum = int(self.r.getid('onlineNum', params.content))
                    NewsStorage.setclicknum(params.originalurl, clicknum)

            elif params.step == YoukuComments.STEP_2:
                if re.findall(r'getVideoPlayInfo\?vid', params.url):
                    # The JSON payload sits between a fixed 20-character
                    # prefix and a 2-character suffix (presumably a JSONP
                    # wrapper -- confirm against the live response).
                    playinfo = json.loads(params.content[20:-2])
                    clicknum = int(
                        playinfo['data']['stat']['vv'].replace(',', ''))
                    votenum = int(
                        playinfo['data']['updown']['up'].replace(',', ''))
                    NewsStorage.setclicknum(params.originalurl, clicknum)
                    NewsStorage.setvotenum(params.originalurl, votenum)
                else:
                    objectId = params.customized['objectId']
                    comments = json.loads(params.content)
                    # Missing or empty 'data' means there is nothing to do.
                    if not comments.get('data'):
                        Logger.getlogging().warning(
                            "{url}:30000 No comments!".format(
                                url=params.originalurl))
                        return

                    # Incremental check: convert the count *before*
                    # comparing it with what is already stored.
                    comments_count = int(comments['data']['totalSize'])
                    cmtnum = CMTStorage.getcount(params.originalurl, True)
                    if comments_count <= cmtnum:
                        return
                    NewsStorage.setcmtnum(params.originalurl, comments_count)

                    # Number of comment pages, capped at self.maxpages.
                    comments_pages = int(comments['data']['totalPage'])
                    if comments_pages == 0:
                        return
                    comments_pages = min(comments_pages, self.maxpages)

                    # Queue pages 1..comments_pages (inclusive).
                    pTime, sign = self._timestamp_and_sign()
                    for page in range(1, comments_pages + 1):
                        commentUrl = YoukuComments.COMMENTS_URL % (
                            objectId, page, YoukuComments.PAGE_SIZE, sign,
                            pTime)
                        self.storeurl(commentUrl, params.originalurl,
                                      YoukuComments.STEP_3,
                                      {'objectId': objectId})

            elif params.step == YoukuComments.STEP_3:
                commentsinfo = json.loads(params.content)
                for comment in commentsinfo['data']['comment']:
                    # NOTE(review): on Python 2, str() of non-ASCII unicode
                    # raises unless the default encoding was changed --
                    # confirm how the project configures it.
                    content = str(comment['content'])
                    curtime = TimeUtility.getuniformtime(
                        int(comment['createTime']))
                    nick = comment['user']['userName']
                    if not CMTStorage.exist(params.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content,
                                            curtime, nick)
        except Exception:
            Logger.printexception()

    def _timestamp_and_sign(self):
        """Return ``(pTime, sign)`` for building a comment-list URL.

        ``pTime`` is the current local time as a millisecond timestamp
        string (second resolution, so it always ends in ``000``); ``sign``
        is ``MD5().m`` applied to the fixed key string followed by ``pTime``.
        """
        pTime = str(int(
            time.mktime(datetime.datetime.now().timetuple()) * 1000))
        sign = MD5().m(
            '100-DDwODVkv&6c4aa6af6560efff5df3c16c704b49f1&' + pTime)
        return pTime, sign