def step1(self, params):
    """Queue the URL of the first comment page for ``params.originalurl``.

    Looks up the publish time of the newest stored comment, extracts the
    comment-thread id from ``params.content`` and, when an id is found,
    passes the first-page comment URL to ``self.storeurl`` for the
    ``STEP_COMMENT_NEXT_PAGE`` step.  Nothing is queued when no id can be
    extracted.  Any exception is reported via ``Logger.printexception``
    rather than raised.
    """
    try:
        # Publish time of the latest comment stored by a previous run.
        before_update = CMTStorage.getlastpublish(params.originalurl)

        # Extract the id the comment URL is built from.
        is_qq_video = self.r.search('^http[s]{0,1}://v\.qq\.com/.*',
                                    params.originalurl)
        if is_qq_video:
            # Sample content:
            # {"comment_id":"1167760750","result":{"code":0,"msg":"Success!",
            #  "ret":0},"srcid":"c0016r7fo07","srcid_type":1001}
            url_id = self.r.getid('comment_id', params.content)
        else:
            # NOTE(review): the fallback keys are assumed to apply only to
            # non-v.qq.com pages -- confirm against the original layout.
            url_id = (self.r.getid('cmt_id', params.content)
                      or self.r.getid('aid', params.content)
                      or self.r.getid('commId', params.content))

        if not url_id:
            return

        first_page_url = self.COMMENTS_URL.format(url_id, 0, self.page_size)
        customized = {
            'url_id': url_id,
            'comment_id': 0,
            'before_update': before_update,
        }
        self.storeurl(first_page_url, params.originalurl,
                      self.STEP_COMMENT_NEXT_PAGE, customized)
    except:
        Logger.printexception()
def step2(self, params):
    """Queue one comment-page request per page of the 36Kr comment list.

    Parses the JSON in ``params.content`` for the total number of comments
    and pages.  Returns without queuing anything when the stored comment
    count is already at least the reported total.  Otherwise at most
    ``self.maxpages`` page URLs are handed to ``self.storeurl`` for
    ``STEP_3``, with the last stored publish time as the customized payload.
    Any exception is reported via ``Logger.printexception``.
    """
    try:
        Logger.getlogging().info("Kr36Comments.STEP_2")
        # The cid was passed down from STEP_1.
        cid = params.customized['cid']
        payload = json.loads(params.content)
        total_comments = payload['data']['total_items']
        total_pages = payload['data']['total_pages']

        # Incremental check: nothing to do if we already hold every comment.
        stored_count = CMTStorage.getcount(params.originalurl)
        if stored_count >= total_comments:
            return

        # Never fetch more than the configured maximum number of pages.
        last_page = (self.maxpages
                     if total_pages >= self.maxpages else total_pages)
        lasttime = CMTStorage.getlastpublish(params.originalurl, True)

        page = 1
        while page <= last_page:
            page_url = Kr36Comments.COMMENT_URL.format(
                cid, self.page_size, page)
            self.storeurl(page_url, params.originalurl,
                          Kr36Comments.STEP_3, lasttime)
            page += 1
    except:
        Logger.printexception()
def process_book(self, params):
    """Run one step of the comment crawl for a 17k.com book page.

    * ``STEP_1``: extract the book id from ``params.originalurl`` and queue
      the first comment page for ``STEP_2``.
    * ``STEP_2``: read the comment and page totals from the JSON response;
      if new comments exist, record the total via ``NewsStorage.setcmtnum``
      and queue up to ``self.maxpages`` comment pages for ``STEP_3``.
    * ``STEP_3``: parse a comment page and store every comment that
      ``CMTStorage.exist`` does not already report.

    Exceptions are not propagated; the traceback is printed instead.
    """
    try:
        if params.step == Comments.STEP_1:
            # Extract the id used to build the comment URL.  The dot before
            # "html" is escaped so it only matches a literal ".html" suffix.
            bookId = self.r.parse(
                r'^http://www\.17k\.com/book/(\w+)\.html$',
                params.originalurl)[0]
            # Build the URL of the first comment page.
            comments_url = Comments.COMMENTS_URL % (bookId, 1,
                                                    Comments.PAGE_SIZE)
            # Ask the download platform to fetch the first comment page.
            self.storeurl(comments_url, params.originalurl,
                          Comments.STEP_2, {'bookId': bookId})

        elif params.step == Comments.STEP_2:
            # First page received: queue every comment page.
            bookId = params.customized['bookId']
            comments = json.loads(params.content)
            comments_count = int(comments['page']['count'])

            # Incremental check: stop when nothing new has been posted.
            cmtnum = CMTStorage.getcount(params.originalurl)
            if cmtnum >= comments_count:
                return
            NewsStorage.setcmtnum(params.originalurl, comments_count)

            # NOTE(review): this value is not used below; the lookup is kept
            # in case it has side effects -- confirm and remove if it is pure.
            lasttime = CMTStorage.getlastpublish(params.originalurl, True)

            page_count = int(comments['page']['pagecount'])
            if page_count == 0:
                return
            if page_count >= self.maxpages:
                page_count = self.maxpages

            # Queue each comment page for parsing.
            for page in range(1, page_count + 1):
                commentUrl = Comments.COMMENTS_URL % (bookId, page,
                                                      Comments.PAGE_SIZE)
                self.storeurl(commentUrl, params.originalurl,
                              Comments.STEP_3, {'bookId': bookId})

        elif params.step == Comments.STEP_3:
            # Parse the comment data and store what is not stored yet.
            commentsinfo = json.loads(params.content)
            for comment in commentsinfo['page']['result']:
                curtime = TimeUtility.getuniformtime(comment['creationDate'])
                content = comment['summary']
                nick = comment['marks']['nikeName']
                if not CMTStorage.exist(params.originalurl, content,
                                        curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content,
                                        curtime, nick)
    except Exception:
        # Syntax valid on both Python 2.6+ and Python 3; the exception
        # object itself was never used.
        traceback.print_exc()
def process(self, params):
    """Run one step of the Youku comment crawl.

    * ``STEP_1``: extract the video id from the page, queue the first
      signed comment-list URL for ``STEP_2``, store the ``onlineNum`` value
      as click count for v.laifeng.com URLs, and queue the play-info URL
      when a video id was found.
    * ``STEP_2``: for a play-info response, store click and vote counts;
      for a comment-list response, record the comment total and queue up
      to ``self.maxpages`` comment pages for ``STEP_3`` when new comments
      exist.
    * ``STEP_3``: parse a comment page and store every comment that
      ``CMTStorage.exist`` does not already report.

    Exceptions are reported via ``Logger.printexception`` and not raised.
    """
    try:
        if params.step == YoukuComments.STEP_1:
            # Extract the id used to build the comment URL.
            objectId = self.r.getid('videoId', params.content, r'\s*:\s*"')
            # Millisecond timestamp that is also part of the signature.
            pTime = str(int(
                time.mktime(datetime.datetime.now().timetuple()) * 1000))
            sign = MD5().m(
                '100-DDwODVkv&6c4aa6af6560efff5df3c16c704b49f1&' + pTime)
            # Build the first comment page URL and hand it to the
            # download platform.
            comments_url = YoukuComments.COMMENTS_URL % (
                objectId, 1, YoukuComments.PAGE_SIZE, sign, pTime)
            self.storeurl(comments_url, params.originalurl,
                          YoukuComments.STEP_2, {'objectId': objectId})

            # Laifeng live streams: the page's onlineNum is the play count.
            if self.r.search(r'^http://v\.laifeng\.com/\d+',
                             params.originalurl):
                clicknum = int(self.r.getid('onlineNum', params.content))
                NewsStorage.setclicknum(params.originalurl, clicknum)

            if objectId:
                playinfo_url = YoukuComments.PLAYINFO_URL.format(
                    vid=objectId)
                self.storeurl(playinfo_url, params.originalurl,
                              YoukuComments.STEP_2, {'objectId': objectId})

        elif params.step == YoukuComments.STEP_2:
            if re.search(r'getVideoPlayInfo\?vid', params.url):
                # NOTE(review): the [20:-2] slice presumably strips a JSONP
                # wrapper around the JSON body -- confirm.
                playinfo = json.loads(params.content[20:-2])
                clicknum = int(
                    playinfo['data']['stat']['vv'].replace(',', ''))
                votenum = int(
                    playinfo['data']['updown']['up'].replace(',', ''))
                NewsStorage.setclicknum(params.originalurl, clicknum)
                NewsStorage.setvotenum(params.originalurl, votenum)
            else:
                objectId = params.customized['objectId']
                comments = json.loads(params.content)
                # A missing or empty 'data' member means there is nothing
                # to crawl.
                data = comments.get('data')
                if not data:
                    Logger.getlogging().warning(
                        "{url}:30000 No comments!".format(
                            url=params.originalurl))
                    return

                # Incremental check.  The count is converted before the
                # comparison; the original wrapped the comparison itself
                # in int().
                comments_count = int(data['totalSize'])
                cmtnum = CMTStorage.getcount(params.originalurl, True)
                if comments_count <= cmtnum:
                    return
                NewsStorage.setcmtnum(params.originalurl, comments_count)

                comments_pages = int(data['totalPage'])
                if comments_pages == 0:
                    return
                # Too many comments: only fetch the first maxpages pages.
                if comments_pages >= self.maxpages:
                    comments_pages = self.maxpages

                # NOTE(review): this value is not used below; the lookup is
                # kept in case it has side effects -- confirm and remove.
                lasttime = CMTStorage.getlastpublish(
                    params.originalurl, True)

                pTime = str(int(
                    time.mktime(datetime.datetime.now().timetuple())
                    * 1000))
                sign = MD5().m(
                    '100-DDwODVkv&6c4aa6af6560efff5df3c16c704b49f1&'
                    + pTime)
                # Pages are 1-based; request exactly pages
                # 1..comments_pages so the maxpages cap is respected.
                for page in range(1, comments_pages + 1):
                    commentUrl = YoukuComments.COMMENTS_URL % (
                        objectId, page, YoukuComments.PAGE_SIZE,
                        sign, pTime)
                    self.storeurl(commentUrl, params.originalurl,
                                  YoukuComments.STEP_3,
                                  {'objectId': objectId})

        elif params.step == YoukuComments.STEP_3:
            # Parse the comment data and store what is not stored yet.
            commentsinfo = json.loads(params.content)
            for comment in commentsinfo['data']['comment']:
                content = str(comment['content'])
                curtime = TimeUtility.getuniformtime(
                    int(comment['createTime']))
                nick = comment['user']['userName']
                if not CMTStorage.exist(params.originalurl, content,
                                        curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content,
                                        curtime, nick)
    except Exception:
        Logger.printexception()
def process(self, params):
    """Run one step of the Tudou comment crawl.

    * ``STEP_1``: extract the video id from the page and queue the first
      signed comment-list URL for ``STEP_2``.
    * ``STEP_2``: read the comment and page totals from the JSON response;
      when new comments exist, record the total via
      ``NewsStorage.setcmtnum`` and queue up to ``self.maxpages`` comment
      pages for ``STEP_3``.
    * ``STEP_3``: parse a comment page and store every comment that
      ``CMTStorage.exist`` does not already report.

    Exceptions are reported via ``Logger.printexception`` and not raised.
    """
    try:
        # Steps are compared with == rather than identity, so equal step
        # values match even when they are distinct objects.
        if params.step == TudouComments.STEP_1:
            # Extract the id used to build the comment URL.
            objectId = self.r.getid('vid', params.content, r'\s*:\s*"')
            # Millisecond timestamp that is also part of the signature.
            pTime = str(int(
                time.mktime(datetime.datetime.now().timetuple()) * 1000))
            sign = MD5().m(
                '100-DDwODVkv&6c4aa6af6560efff5df3c16c704b49f1&' + pTime)
            # Build the first comment page URL and hand it to the
            # download platform.
            comments_url = TudouComments.COMMENTS_URL % (
                objectId, 1, TudouComments.PAGE_SIZE, sign, pTime)
            self.storeurl(comments_url, params.originalurl,
                          TudouComments.STEP_2, {'objectId': objectId})

        elif params.step == TudouComments.STEP_2:
            objectId = params.customized['objectId']
            comments = json.loads(params.content)
            # A missing or empty 'data' member means there is nothing to
            # crawl.
            data = comments.get('data')
            if not data:
                Logger.getlogging().warning(
                    "{url}:30000 No comments!".format(
                        url=params.originalurl))
                return

            # Incremental check.  The count is converted before the
            # comparison; the original wrapped the comparison itself in
            # int().
            comments_count = int(data['totalSize'])
            cmtnum = CMTStorage.getcount(params.originalurl, True)
            if comments_count <= cmtnum:
                return
            NewsStorage.setcmtnum(params.originalurl, comments_count)

            comments_pages = int(data['totalPage'])
            if comments_pages == 0:
                return
            # Too many comments: only fetch the first maxpages pages.
            if comments_pages >= self.maxpages:
                comments_pages = self.maxpages

            # NOTE(review): this value is not used below; the lookup is
            # kept in case it has side effects -- confirm and remove.
            lasttime = CMTStorage.getlastpublish(params.originalurl, True)

            pTime = str(int(
                time.mktime(datetime.datetime.now().timetuple()) * 1000))
            sign = MD5().m(
                '100-DDwODVkv&6c4aa6af6560efff5df3c16c704b49f1&' + pTime)
            # Pages are 1-based; request exactly pages 1..comments_pages so
            # the maxpages cap is respected.
            for page in range(1, comments_pages + 1):
                commentUrl = TudouComments.COMMENTS_URL % (
                    objectId, page, TudouComments.PAGE_SIZE, sign, pTime)
                self.storeurl(commentUrl, params.originalurl,
                              TudouComments.STEP_3, {'objectId': objectId})

        elif params.step == TudouComments.STEP_3:
            # Parse the comment data and store what is not stored yet.
            commentsinfo = json.loads(params.content)
            for comment in commentsinfo['data']['comment']:
                content = comment['content']
                curtime = TimeUtility.getuniformtime(
                    int(comment['createTime']))
                nick = comment['user']['userName']
                if not CMTStorage.exist(params.originalurl, content,
                                        curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content,
                                        curtime, nick)
    except Exception:
        Logger.printexception()
def isnewesttime(self, url, curtime):
    """Return True when ``curtime`` is greater than the value returned by
    ``CMTStorage.getlastpublish(url)``, otherwise False."""
    latest_stored = CMTStorage.getlastpublish(url)
    return True if curtime > latest_stored else False