def setclick(self, params):
    """Store the play count and up-vote count found in the page content.

    Looks up the values keyed 'play_count' and 'up' in ``params.content``
    through ``self.r.getid`` and, for each value that is truthy, records it
    against ``params.originalurl`` (play count as click number, 'up' as
    vote number).
    """
    # Both lookups happen first, then the stores, in that order.
    play_total, up_total = (
        self.r.getid(key, params.content) for key in ('play_count', 'up'))

    if play_total:
        NewsStorage.setclicknum(params.originalurl, play_total)
    if up_total:
        NewsStorage.setvotenum(params.originalurl, up_total)
def process(self, proparam):
    """Handle one downloaded jiemian page according to its crawl step.

    STEP_1 is the article page: the comment total is recorded and the
    comment-page URLs that still need fetching are queued as STEP_3.
    STEP_3 is one comment page: its comments are parsed and stored.
    Any other step is logged as an error.

    Every exception is caught here and its traceback printed, so nothing
    propagates to the caller.
    """
    Logger.getlogging().info(proparam.url)
    try:
        # '==' rather than 'is': step constants are compared by value.
        if proparam.step == jiemianComments.STEP_1:
            self._jiemian_queue_comment_pages(proparam)
        elif proparam.step == jiemianComments.STEP_3:
            self._jiemian_store_comment_page(proparam)
        else:
            Logger.getlogging().error("proparam.step == %d", proparam.step)
    except Exception:
        traceback.print_exc()


def _jiemian_queue_comment_pages(self, proparam):
    """STEP_1: record counters and queue the comment pages not yet stored."""
    # The article id is the number after the first path segment of the URL.
    article_id = re.findall(r'^http://www\.jiemian\.com/\w+/(\d+)',
                            proparam.url)[0]
    self.setclick(proparam)

    # The pattern captures digits only, so the total is kept as an int.
    total = int(re.findall(r'"comment_count">(\d+)</span>',
                           proparam.content)[0])
    if total == 0:
        return
    NewsStorage.setcmtnum(proparam.originalurl, total)

    # Incremental crawl: only fetch pages for comments we do not have yet.
    stored = CMTStorage.getcount(proparam.originalurl, True)
    if stored >= total:
        return
    page_num = int(math.ceil(float(total - stored) / self.PAGE_SIZE))
    page_num = min(page_num, self.maxpages)

    for page in range(1, page_num + 1):
        url = jiemianComments.COMMENTS_URL % (article_id, page)
        self.storeurl(url, proparam.originalurl, jiemianComments.STEP_3)


def _jiemian_store_comment_page(self, proparam):
    """STEP_3: store the vote number and every new comment on the page."""
    votenum = self.r.getid('ding', proparam.content)
    if votenum:
        NewsStorage.setvotenum(proparam.originalurl, votenum)
    else:
        Logger.getlogging().debug("Unable to get votenum")

    # The payload is HTML with escaped quotes and slashes, hence the
    # literal backslashes in the patterns.
    comments = re.findall(r'<p>(.+?)<\\/p>', proparam.content)
    ctime = re.findall(r'<span class=\\"date\\">(.+?)<\\/span>',
                       proparam.content)
    nicks = re.findall(r'class=\\"author-name\\">(.+?)<\\/a>',
                       proparam.content)

    # The three lists are matched independently; if they disagree, only
    # the leading entries that exist in all of them are processed.
    if not (len(comments) == len(ctime) == len(nicks)):
        Logger.getlogging().warning(
            "comment/time/nick counts differ: %d/%d/%d",
            len(comments), len(ctime), len(nicks))

    for raw_content, raw_time, raw_nick in zip(comments, ctime, nicks):
        curtime = TimeUtility.getuniformtime(raw_time.replace('\\', ''))
        # SECURITY(review): eval() is applied to text scraped from a
        # remote page; a comment containing an unescaped double quote
        # could execute arbitrary code. It is kept only to preserve the
        # existing escape decoding. Replace it with a non-executing
        # decoder (e.g. the 'unicode_escape' codec) once the type of
        # proparam.content is confirmed.
        content = eval('u"' + raw_content + '"').encode('utf-8')
        nick = eval('u"' + raw_nick + '"').encode('utf-8')
        if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
            CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
def setplayinfo(self, params):
    """Store the vote number for a topic from a JSON response.

    Reads ``topic_id`` from ``params.customized``, parses ``params.content``
    as JSON and records ``jsondata[topic_id]['pos']`` as the vote number
    for ``params.originalurl``.

    This is best effort: any error is logged at debug level (the URL only)
    and otherwise ignored.
    """
    try:
        topic_id = params.customized['topic_id']
        votenum = json.loads(params.content)[topic_id]['pos']
        NewsStorage.setvotenum(params.originalurl, votenum)
    except Exception:
        # 'except Exception' instead of a bare 'except' so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        Logger.getlogging().debug(params.originalurl)
def process(self, params):
    """Handle one downloaded Youku page according to its crawl step.

    STEP_1 is the video page: the first comment page (and, when a video id
    was found, the play-info URL) is queued as STEP_2.
    STEP_2 is either the play-info response (click and vote numbers are
    stored) or the first comment page (the total is stored and all comment
    pages are queued as STEP_3).
    STEP_3 is one comment page: its comments are stored.

    Every exception is caught here and reported through
    ``Logger.printexception``; nothing propagates to the caller.
    """
    try:
        # '==' rather than 'is': step constants are compared by value.
        if params.step == YoukuComments.STEP_1:
            self._youku_queue_first_page(params)
        elif params.step == YoukuComments.STEP_2:
            if re.search(r'getVideoPlayInfo\?vid', params.url):
                self._youku_store_play_info(params)
            else:
                self._youku_queue_comment_pages(params)
        elif params.step == YoukuComments.STEP_3:
            self._youku_store_comment_page(params)
    except Exception:
        Logger.printexception()


def _youku_signature(self):
    """Return ``(sign, timestamp)`` for a comment request made now."""
    # Local time in milliseconds; timetuple() drops sub-second precision.
    timestamp = str(int(
        time.mktime(datetime.datetime.now().timetuple()) * 1000))
    sign = MD5().m(
        '100-DDwODVkv&6c4aa6af6560efff5df3c16c704b49f1&' + timestamp)
    return sign, timestamp


def _youku_queue_first_page(self, params):
    """STEP_1: queue the first comment page and the play-info request."""
    object_id = self.r.getid('videoId', params.content, r'\s*:\s*"')
    sign, timestamp = self._youku_signature()
    comments_url = YoukuComments.COMMENTS_URL % (
        object_id, 1, YoukuComments.PAGE_SIZE, sign, timestamp)
    self.storeurl(comments_url, params.originalurl, YoukuComments.STEP_2,
                  {'objectId': object_id})

    # laifeng live pages carry the viewer count in the page itself.
    if self.r.search(r'^http://v\.laifeng\.com/\d+', params.originalurl):
        clicknum = int(self.r.getid('onlineNum', params.content))
        NewsStorage.setclicknum(params.originalurl, clicknum)

    if object_id:
        playinfo_url = YoukuComments.PLAYINFO_URL.format(vid=object_id)
        self.storeurl(playinfo_url, params.originalurl,
                      YoukuComments.STEP_2, {'objectId': object_id})


def _youku_store_play_info(self, params):
    """STEP_2 (play info): store the click and up-vote numbers."""
    # NOTE(review): [20:-2] strips a fixed-length wrapper around the JSON
    # body, presumably a JSONP callback -- confirm against a live response.
    playinfo = json.loads(params.content[20:-2])
    data = playinfo['data']
    # Counts arrive as strings with thousands separators.
    clicknum = int(data['stat']['vv'].replace(',', ''))
    votenum = int(data['updown']['up'].replace(',', ''))
    NewsStorage.setclicknum(params.originalurl, clicknum)
    NewsStorage.setvotenum(params.originalurl, votenum)


def _youku_queue_comment_pages(self, params):
    """STEP_2 (first comment page): store the total and queue every page."""
    object_id = params.customized['objectId']
    data = json.loads(params.content).get('data')
    if not data:
        Logger.getlogging().warning(
            "{url}:30000 No comments!".format(url=params.originalurl))
        return

    # Incremental crawl: stop when nothing new has been posted. The total
    # is converted before comparing (the conversion used to wrap the whole
    # comparison, which left the operands unconverted).
    total = int(data['totalSize'])
    stored = CMTStorage.getcount(params.originalurl, True)
    if total <= stored:
        return
    NewsStorage.setcmtnum(params.originalurl, total)

    pages = int(data['totalPage'])
    if pages == 0:
        return
    # Cap the crawl depth at self.maxpages.
    pages = min(pages, self.maxpages)

    sign, timestamp = self._youku_signature()
    # Pages are 1-based; request exactly pages 1..pages.
    for page in range(1, pages + 1):
        comment_url = YoukuComments.COMMENTS_URL % (
            object_id, page, YoukuComments.PAGE_SIZE, sign, timestamp)
        self.storeurl(comment_url, params.originalurl,
                      YoukuComments.STEP_3, {'objectId': object_id})


def _youku_store_comment_page(self, params):
    """STEP_3: store every comment on the page that is not stored yet."""
    commentsinfo = json.loads(params.content)
    for comment in commentsinfo['data']['comment']:
        # NOTE(review): under Python 2, str() on non-ASCII unicode raises
        # unless the default encoding was changed elsewhere -- confirm.
        content = str(comment['content'])
        curtime = TimeUtility.getuniformtime(int(comment['createTime']))
        nick = comment['user']['userName']
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)