def step2news(self, params):
    """Parse the comment total and thread/object ids from the JSON payload,
    then queue every remaining comment page (page 1 is parsed inline).

    Cleanup: removed the unused `page_count` local; regex literals are now
    raw strings (byte-identical patterns, no invalid-escape warnings).
    """
    curcmtnum = self.r.parse(r'"total"\:(\d+)', params.content)[0]
    threadid = self.r.parse(r'"thread_id"\:"(\d+)"', params.content)[0]
    objectid = self.r.parse(r'"object_id"\:"(\d+)"', params.content)[0]
    curcmtnum = int(curcmtnum)
    NewsStorage.setcmtnum(params.originalurl, curcmtnum)
    dbcmtnum = CMTStorage.getcount(params.originalurl, True)
    if dbcmtnum >= curcmtnum:
        return
    # pages still to fetch, capped at self.maxpages
    pages = int(math.ceil(float(curcmtnum - dbcmtnum) / self.page_size))
    if pages >= self.maxpages:
        pages = self.maxpages
    for page in range(1, pages + 1, 1):
        if page == 1:
            # page 1 is the content we already hold
            self.step3news(params)
            continue
        comment_url = U17NewsComments.COMMENT_URL_NEWS.format(
            threadid=threadid, objectid=objectid, page=page,
            pagesize=self.page_size, comicid=objectid)
        self.storeurl(comment_url, params.originalurl, U17NewsComments.STEP_3)
def step2(self, params):
    """Read the on-page comment count and queue the remaining comment pages.

    Fixes: dropped a dead local `COMMENTS_URL` assignment that shadowed the
    class attribute actually used below; simplified the count slicing.
    """
    Logger.getlogging().info("LaohuComments.STEP_2")
    token = params.customized['token']
    sourceId = params.customized['sourceId']
    xhtml = XPathUtility(html=params.content)
    # label text carries the count in parentheses, e.g. u'网友评论(32)'
    countstr = xhtml.getlist('//*[@class="filter-by-type"]')[0]
    comment_counts = int(countstr[5:-1])  # same as countstr[5:len(countstr)-1]
    if comment_counts:
        NewsStorage.setcmtnum(params.originalurl, comment_counts)
    if comment_counts == 0:
        Logger.getlogging().warning('{url}:30000 No comments'.format(url=params.originalurl))
        return
    cmtnum = CMTStorage.getcount(params.originalurl, True)
    # incremental check: nothing new since the last crawl
    if cmtnum >= comment_counts:
        return
    page_num = int(math.ceil(float(comment_counts - cmtnum) / self.PAGE_SIZE))
    if page_num >= self.maxpages:
        page_num = self.maxpages
    # page 1 is parsed from the content already downloaded
    self.getComments(params)
    if comment_counts > 15:
        # NOTE(review): the URL template expects token=%s but sourceId is
        # substituted, as in the original — TODO confirm against the site API.
        for page in range(2, page_num + 1, 1):
            commentUrl = LaohuComments.COMMENTS_URL % (page, sourceId)
            self.storeurl(commentUrl, params.originalurl, LaohuComments.STEP_3,
                          {'token': token, 'sourceId': sourceId})
def step2_ebook(self, params):
    """QQ-reading ebook only: compute the page count and queue comment URLs.

    Fixes: `dict.has_key` (removed in Python 3) replaced by `in`; `page_count`
    coerced to int once so the `range` below cannot fail on a string value;
    unused bound exception dropped.
    """
    try:
        bid = params.customized['bid']
        jsoncontent = json.loads(params.content)
        if 'data' not in jsoncontent:
            Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
            return
        comments_count = jsoncontent['data']['total']
        page_count = int(jsoncontent['data']['pageCount'])
        # incremental check against the comments already stored
        cmtnum = CMTStorage.getcount(params.originalurl, True)
        NewsStorage.setcmtnum(params.originalurl, comments_count)
        if cmtnum >= comments_count:
            return
        # cap the crawl depth
        if page_count >= self.maxpages:
            page_count = self.maxpages
        for page in range(1, page_count + 1, 1):
            commentinfo_url = self.EBOOK_COMMENTS_URL.format(site='intro', bid=bid, page=page)
            self.storeurl(commentinfo_url, params.originalurl, self.STEP_COMMENT_NEXT_PAGE)
    except Exception:
        Logger.printexception()
def step2(self, params):
    """Store the first page of comments, then queue the remaining pages."""
    Logger.getlogging().info("MkzhanComments.STEP_2")
    comic_id = params.customized['comic_id']
    payload = json.loads(params.content)
    total = int(payload['data']['count'])
    stored = CMTStorage.getcount(params.originalurl, True)
    # persist every first-page comment not already in the DB
    for entry in payload['data']['list']:
        body = entry['content']
        posted = TimeUtility.getuniformtime(entry['create_time'])
        author = entry['username']
        if not CMTStorage.exist(params.originalurl, body, posted, author):
            CMTStorage.storecmt(params.originalurl, body, posted, author)
    NewsStorage.setcmtnum(params.originalurl, total)
    if stored >= total:
        Logger.getlogging().warning(
            '{url}:30000 No comments'.format(url=params.originalurl))
        return
    pending = int(math.ceil(float(total - stored) / self.PAGE_SIZE))
    if pending >= self.maxpages:
        pending = self.maxpages
    # page 1 was handled above; queue pages 2..pending
    for pageno in range(2, pending + 1):
        next_url = MkzhanComments.COMMENTS_URL % (comic_id, pageno, self.PAGE_SIZE)
        self.storeurl(next_url, params.originalurl, MkzhanComments.STEP_3)
def step2(self, params):
    """Build the paged comment URLs for a sohu changyan topic."""
    try:
        data = json.loads(params.content)
        topic_id = data['topic_id']
        total = float(data.get('cmt_sum', -1))
        NewsStorage.setcmtnum(params.originalurl, total)
        stored = CMTStorage.getcount(params.originalurl, True)
        if stored >= total:
            return
        pagecount = int(math.ceil(float(total - stored) / self.page_size))
        if pagecount >= self.maxpages:
            pagecount = self.maxpages
        # tv.sohu.com uses a dedicated client id and page size
        is_tv = self.r.search('http[s]{0,1}://.*tv\.sohu.com/.*', params.originalurl)
        for pageno in range(1, pagecount + 1):
            if is_tv:
                url = self.COMMENTS_URL.format(self.tv_client_id, topic_id,
                                               pageno, self.tv_page_size)
            else:
                url = self.COMMENTS_URL.format(self.client_id, topic_id,
                                               pageno, self.page_size)
            self.storeurl(url, params.originalurl, self.STEP_COMMENT_NEXT_PAGE)
    except:
        Logger.printexception()
        Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
def setp_2(self, params):
    """Queue Le comment pages; template depends on zongyi vs tv URL."""
    data = json.loads(params.content)
    total = float(data['total'])
    NewsStorage.setcmtnum(params.originalurl, total)
    if int(total) == 0:
        return
    # incremental check
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    pagecount = int(math.ceil(float(total - stored) / self.PAGE_SIZE))
    if pagecount >= self.maxpages:
        pagecount = self.maxpages
    pid = params.customized['pid']
    # the two original branches differed only in the URL template
    if re.match(r'^http://zongyi\.le\.com/.*', params.url):
        template = LeComments.COMMENTS_URL_ZONGYI1
    else:
        template = LeComments.COMMENTS_URL_TV
    for pageno in range(1, pagecount + 1):
        if pageno == 1:
            # page 1 is the payload we already hold
            self.geturlcomments(params)
            continue
        self.storeurl(template % (pageno, pid), params.originalurl, LeComments.STEP_4)
def step2(self, params):
    """Changyan: read the comment total and topic id, queue per-page URLs.

    Cleanup: removed two locals (`liteloadApi`, `topic_url`) read from
    params.customized but never used.
    """
    client_id = params.customized['client_id']
    commentsApi = params.customized['commentsApi']
    content = json.loads(params.content)
    curcmtnum = float(content.get('cmt_sum', 0))
    NewsStorage.setcmtnum(params.originalurl, curcmtnum)
    dbcmtnum = CMTStorage.getcount(params.originalurl, True)
    if dbcmtnum >= curcmtnum:
        return
    topic_id = content.get('topic_id', '')
    if not topic_id:
        Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_XPATHVALUE)
        return
    # number of pages still to fetch, capped at self.maxpages
    pages = int(math.ceil(float(curcmtnum - dbcmtnum) / ChangyanComments.PAGE_SIZE))
    if pages >= self.maxpages:
        pages = self.maxpages
    for page in range(1, pages + 1, 1):
        url = ChangyanComments.COMMENTS_URL.format(commentsApi=commentsApi,
                                                   client_id=client_id,
                                                   page_no=page,
                                                   page_size=ChangyanComments.PAGE_SIZE,
                                                   topic_id=topic_id)
        self.storeurl(url, params.originalurl, ChangyanComments.STEP_3)
def step2_ifeng_xiaobg(self, params):
    """Record click/comment counts and queue paged POST requests."""
    try:
        body = json.loads(params.content)
        joins = float(body.get('join_count', '-1'))
        if joins > 0:
            NewsStorage.setclicknum(params.originalurl, joins)
        total = body['count']
        NewsStorage.setcmtnum(params.originalurl, total)
        stored = CMTStorage.getcount(params.originalurl, True)
        if stored >= total:
            return
        pending = int(math.ceil(float(total - stored) / self.page_size))
        if pending >= self.maxpages:
            pending = self.maxpages
        for pageno in range(1, pending + 1):
            if pageno == 1:
                # page 1 is already in params.content
                self.ifengnews_step3(params)
                continue
            self.post_data['p'] = pageno
            self.storeposturl(self.post_url, params.originalurl,
                              self.IFENG_NEWS_NEXT_PAGE, IfengNewsComments.post_data)
    except:
        Logger.printexception()
def step2(self, params):
    """Compute the remaining pages and queue per-page xinhua comment URLs."""
    try:
        Logger.getlogging().info("xinhuaComments.STEP_2")
        newsId = params.customized['newsId']
        info = json.loads(params.content)
        total = info['totalRows']
        NewsStorage.setcmtnum(params.originalurl, total)
        pagecount = info['totalPage']
        # incremental check
        stored = CMTStorage.getcount(params.originalurl, True)
        if stored >= total:
            return
        if pagecount >= self.maxpages:
            pagecount = self.maxpages
        for pid in range(1, int(pagecount) + 1):
            url = xinhuaNewsComments.COMMENTS_URL_NEWS.format(newsId=newsId, pid=pid)
            self.storeurl(url, params.originalurl, xinhuaNewsComments.STEP_3)
    except:
        Logger.printexception()
def step2bbs(self, params):
    """Read the total reply count and queue BBS comment pages."""
    Logger.getlogging().info("JoyComments.STEP_2")
    topic_id = params.customized['topic_id']
    domain = params.customized['domain']
    try:
        data = json.loads(params.content)
        total = data['result']['mainreplys']['page']['totalRows']
        NewsStorage.setcmtnum(params.originalurl, total)
    except:
        Logger.getlogging().warning('{url} Errorcode:40000'.format(url=params.originalurl))
        return
    # incremental check against stored comments
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    pending = int(math.ceil(float(total - stored) / self.PAGE_SIZE))
    if pending >= self.maxpages:
        pending = self.maxpages
    for pageno in range(1, pending + 1):
        url = JoyComments.COMMENT_URL.format(topic_id=topic_id, domain=domain, page=pageno)
        self.storeurl(url, params.originalurl, JoyComments.STEP_3_BBS)
def step2(self, params):
    """Parse the total comment count and queue offset-paged comment URLs.

    Fixes: the loop tested `if page == 1` but the loop variable is `offset`,
    so every call raised NameError before queueing anything; the local named
    `max` shadowed the builtin and is renamed.
    """
    try:
        threadId = params.customized['threadId']
        jsondata = json.loads(params.content)
        comment_totalnum = jsondata['tcount']
        NewsStorage.setcmtnum(params.originalurl, comment_totalnum)
    except:
        Logger.getlogging().warning('{}:30000 No comments'.format(
            params.originalurl))
        return
    # incremental check
    cmtnum = CMTStorage.getcount(params.originalurl, True)
    if cmtnum >= int(comment_totalnum):
        return
    pages = int(math.ceil(float(comment_totalnum - cmtnum) / VComments.limit))
    if pages > self.maxpages:
        pages = self.maxpages
    for offset in range(1, pages + 1, 1):
        if offset == 1:  # bug fix: was `page`, an undefined name
            self.step3(params)
            continue
        comment_url = VComments.COMMENT_URL.format(threadId=threadId,
                                                   limit=VComments.limit,
                                                   offset=offset * VComments.limit)
        self.storeurl(comment_url, params.originalurl, VComments.V_STEP_3,
                      {'threadId': threadId})
def getcomments_step2(self, params):
    """Queue POST requests for every remaining comment page."""
    bookId = params.customized['bookId']
    parser = XPathUtility(html=params.content)
    pagecount = int(parser.xpath('//div[@class="page"]/@pagenum')[0])
    total = int(parser.xpath('//div[@class="page"]/@total')[0])
    Logger.getlogging().debug(total)
    if pagecount == 0:
        return
    # incremental check
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    pending = int(math.ceil(float(total - stored) / self.PAGE_SIZE))
    if pending >= self.maxpages:
        pending = self.maxpages
    NewsStorage.setcmtnum(params.originalurl, total)
    post_url = PubComments.COMMENTS_URL  # same endpoint for every page
    for pageno in range(1, pending + 1):
        self.storeposturl(post_url, params.originalurl, PubComments.STEP_3,
                          {'bookId': bookId, 'pageNum': pageno})
def step1(self, params):
    """Extract the douban movie id and queue comment-list pages."""
    articleId = self.r.parse(r'^https://movie\.douban\.com/\w+/(\d+)', params.url)[0]
    # comment total is the digits inside the comments-section header link
    xpathobj = XPathUtility(params.content)
    text = xpathobj.getstring(
        xpath='//*[@id="comments-section"]//h2/*[@class="pl"]/a')
    digits = self.r.parse('\d+', text)
    if not digits:
        return
    total = float(digits[0])
    NewsStorage.setcmtnum(params.originalurl, total)
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    pending = int(math.ceil(float(total - stored) / self.PAGE_SIZE))
    if pending >= self.maxpages:
        pending = self.maxpages
    for pageno in range(1, pending + 1):
        url = doubanComments.COMMENTS_URL.format(articleId=articleId,
                                                 start=(pageno - 1) * self.PAGE_SIZE,
                                                 pagesize=self.PAGE_SIZE)
        self.storeurl(url, params.originalurl, doubanComments.STEP_2)
def step2(self, params):
    """Queue iqiyi comment pages; the template depends on whether qitanid is set."""
    qitanid = params.customized['qitanid']
    tvid = params.customized['tvid']
    data = json.loads(params.content)
    total = float(data['data']['count'])
    NewsStorage.setcmtnum(params.originalurl, total)
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    pending = int(math.ceil(float(total - stored) / self.DEFAULT_PAGE_SIZE))
    if pending >= self.maxpages:
        pending = self.maxpages
    use_qitan = bool(int(qitanid))  # non-zero qitanid selects COMMENTS_URL1
    for pageno in range(1, pending + 1):
        if use_qitan:
            url = IqiyiComments.COMMENTS_URL1.format(
                pageno=pageno, pagesize=IqiyiComments.DEFAULT_PAGE_SIZE,
                qitanid=qitanid, tvid=tvid)
        else:
            url = IqiyiComments.COMMENTS_URL2.format(
                pageno=pageno, pagesize=IqiyiComments.DEFAULT_PAGE_SIZE,
                tvid=tvid)
        self.storeurl(url, params.originalurl, IqiyiComments.STEP_3)
def process(self, proparam):
    """jiemian.com comment pipeline.

    STEP_1: read the article id and comment count from the page, then queue
    one comment URL per page. STEP_3: parse a comment page (an escaped HTML
    fragment) with regexes and store new comments.
    """
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step is jiemianComments.STEP_1:
            # article id from the URL path
            articleId = re.findall(r'^http://www\.jiemian\.com/\w+/(\d+)', proparam.url).__getitem__(0)
            # record the click count
            self.setclick(proparam)
            # comment count scraped from the page markup
            comments_count = float(re.findall(r'"comment_count">(\d+)</span>', proparam.content).__getitem__(0))
            if comments_count:
                NewsStorage.setcmtnum(proparam.originalurl, comments_count)
            if int(comments_count) == 0:
                return
            # incremental check against comments already stored
            cmtnum = CMTStorage.getcount(proparam.originalurl, True)
            if cmtnum >= comments_count:
                return
            page_num = int(math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            # queue one URL per comment page
            for page in range(1, page_num + 1, 1):
                url = jiemianComments.COMMENTS_URL % (articleId, page)
                self.storeurl(url, proparam.originalurl, jiemianComments.STEP_3)
        elif proparam.step == jiemianComments.STEP_3:
            # Earlier BeautifulSoup-based variant, kept for reference:
            # proparam.content = proparam.content.replace('\\','')
            # soup = BeautifulSoup(proparam.content, 'html5lib')
            # items = soup.select('.comment-post')
            # for item in items:
            #     content = item.select_one('.comment-main > p').get_text().encode('utf-8')
            #     curtime = TimeUtility.getuniformtime(item.select_one('.date').get_text())
            #     nick = item.select_one('.author-name').get_text().decode('utf-8').encode('utf-8')
            # vote ("ding") count
            votenum = self.r.getid('ding', proparam.content)
            if votenum == '':
                Logger.getlogging().debug("Unable to get playcount")
            else:
                NewsStorage.setvotenum(proparam.originalurl, votenum)
            # the payload is backslash-escaped HTML, hence the \\/ patterns
            comments = re.findall(r'<p>(.+?)<\\/p>', proparam.content)
            ctime = re.findall(r'<span class=\\"date\\">(.+?)<\\/span>', proparam.content)
            nicks = re.findall(r'class=\\"author-name\\">(.+?)<\\/a>', proparam.content)
            for index in range(0, len(comments)):
                time = ctime[index].replace('\\', '')
                curtime = TimeUtility.getuniformtime(time)
                # NOTE(review): eval is used to decode \uXXXX escapes; it will
                # break (or worse, execute code) if a comment contains a double
                # quote — TODO consider codecs 'unicode_escape' instead.
                content = eval('u"' + comments[index] + '"').encode('utf-8')
                nick = eval('u"' + nicks[index] + '"').encode('utf-8')
                if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
        else:
            Logger.getlogging().error("proparam.step == %d", proparam.step)
    except Exception, e:
        traceback.print_exc()
def step1(self, params):
    """Validate the huxiu article URL, read its comment count, queue pages."""
    pattern = 'https://www.huxiu.com/article/(\d+).html'
    if not self.r.search(pattern, params.originalurl):
        Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_TEMPLATE)
        return
    object_id = self.r.parse(pattern, params.originalurl)[0]
    total = XPathUtility(params.content).getnumber('//*[@class="article-pl pull-left"]')
    if not total:
        Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_COMMNETS)
        return
    NewsStorage.setcmtnum(params.originalurl, total)
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    # remaining pages, capped
    pending = int(math.ceil(float(total - stored) / self.page_size))
    if pending >= self.maxpages:
        pending = self.maxpages
    for pageno in range(1, pending + 1):
        commonurl = self.COMMONURL.format(object_id=object_id, page=pageno)
        self.storeurl(commonurl, params.originalurl, HuxiupostComments.EACH)
def step1(self, params):
    """Strip the JSONP padding, compare counts, and queue comment pages.

    Fixes: the page computation divided two ints, so in Python 2
    `math.ceil` floored the result and the last partial page was dropped —
    now uses float() like the sibling handlers; added the missing `continue`
    after handling page 1 inline, which the original re-queued.
    """
    try:
        url = params.originalurl
        videoId = params.customized['videoId']
        # strip anything outside the outermost JSON braces (JSONP wrapper)
        params.content = params.content[params.content.index('{'):params.content.rindex('}') + 1]
        jsonData = json.loads(params.content)['data']
        hasCmts = jsonData['page']['count']
        # no comments at all
        if not hasCmts:
            return
        currCmtsCount = jsonData['page']['acount']
        NewsStorage.setcmtnum(url, currCmtsCount)
        prevCmtsCount = int(CMTStorage.getcount(url))
        # nothing new since the last crawl
        if prevCmtsCount >= currCmtsCount:
            return
        pageNum = int(math.ceil(float(hasCmts - prevCmtsCount) / self.pageSize))
        for page in range(1, pageNum + 1):
            if page == 1:
                # page 1 is the payload we already hold
                self.step2(params)
                continue
            pageUrl = self.pageUrl.format(page = page, videoId = videoId)
            self.storeurl(pageUrl, url, self.STEP_CMTS)
    except:
        Logger.printexception()
def getComments(self, params, url):
    """Parse one BBS listing page and store new main-floor comments.

    Cleanup: removed the unused `comments = []` accumulator.
    """
    # current page number, captured by the `url` pattern from params.url
    pg = self.r.parse(url, params.url)[0]
    soup = BeautifulSoup(params.content, 'html5lib')
    infos = soup.select('tr > td.postcontent')
    # on page 1 the first cell is the article body itself, not a comment
    start = 1 if pg == '1' else 0
    for info in infos[start:]:
        # only main-floor posts carry this div
        if info.select_one('div[class="postmessage defaultpost"]'):
            content = info.select_one('div[class="postmessage defaultpost"]').get_text()\
                .replace('\t', '').replace('\n', '').replace(' ', '').strip()
            # post-time text like u'发表于 2016-10-7 18:04:25' — drop the 4-char prefix
            updatetime = info.select_one(
                'div.postinfo > font').get_text().strip()[4:] + ':00'
            curtime = getuniformtime(updatetime)
            nick = 'none'
            if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    comments_couts = CMTStorage.getcount(params.originalurl)
    NewsStorage.setcmtnum(params.originalurl, comments_couts)
def ifengnews_step2(self, params):
    """Record counts, then queue every remaining ifeng comment page."""
    try:
        oriurl = params.customized['oriurl']
        body = json.loads(params.content)
        joins = float(body.get('join_count', '-1'))
        if joins > 0:
            NewsStorage.setclicknum(params.originalurl, joins)
        total = float(body['count'])
        NewsStorage.setcmtnum(params.originalurl, total)
        stored = CMTStorage.getcount(params.originalurl, True)
        if stored >= total:
            return
        pending = int(math.ceil(float(total - stored) / self.page_size))
        if pending >= self.maxpages:
            pending = self.maxpages
        for pageno in range(1, pending + 1):
            if pageno == 1:
                # first page is the payload we already hold
                self.ifengnews_step3(params)
                continue
            commentinfo_url = IfengNewsComments.COMMENTS_URL.format(
                oriurl=oriurl, pg=pageno, ps=self.page_size)
            self.storeurl(commentinfo_url, params.originalurl,
                          IfengNewsComments.IFENG_NEWS_NEXT_PAGE)
    except:
        Logger.printexception()
def step2(self, params):
    """Read the cursor-style total and queue narutom comment pages."""
    data = json.loads(params.content)
    if 'thread' not in data:
        Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
        return
    threadid = data['thread']['thread_id']
    total = int(data['cursor']['total'])
    # no comments at all
    if total == 0:
        Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
        return
    NewsStorage.setcmtnum(params.originalurl, total)
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    pending = int(math.ceil(float(total - stored) / self.DEFAULT_PAGE_SIZE))
    if pending >= self.maxpages:
        pending = self.maxpages
    for pageno in range(1, pending + 1):
        url = NarutomVideoComments.COMMENTS_URL.format(
            threadid=threadid,
            limit=NarutomVideoComments.DEFAULT_PAGE_SIZE,
            page=pageno)
        self.storeurl(url, params.originalurl, NarutomVideoComments.STEP_3)
def step2(self, params):
    """Queue per-page comment URLs; page 1 is parsed from the current content.

    Fix: the original fell through after self.step3(params) on page 1 and
    queued page 1 for download again; sibling handlers all `continue` here.
    """
    operaId = params.customized['operaId']
    contentId = params.customized['contentId']
    comments = json.loads(params.content)
    curcmtnum = int(comments['pageTurn']['rowCount'])
    NewsStorage.setcmtnum(params.originalurl, curcmtnum)
    dbcmtnum = CMTStorage.getcount(params.originalurl, True)
    if dbcmtnum >= curcmtnum:
        return
    # remaining pages, capped at self.maxpages
    pages = int(math.ceil(float(curcmtnum - dbcmtnum) / self.PAGE_SIZE))
    if pages >= self.maxpages:
        pages = self.maxpages
    for page in range(1, pages + 1, 1):
        if page == 1:
            self.step3(params)
            continue
        commentUrl = Comments.COMMENTS_URL % (operaId, contentId, page,
                                              Comments.PAGE_SIZE)
        self.storeurl(commentUrl, params.originalurl, Comments.STEP_3,
                      {'operaId': operaId, 'contentId': contentId})
def setp_3(self, params):
    """Queue Le comment pages (cid/xid/pid variant); page 1 parsed inline."""
    data = json.loads(params.content)
    total = float(data['total'])
    NewsStorage.setcmtnum(params.originalurl, total)
    if int(total) == 0:
        return
    # incremental check
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    pending = int(math.ceil(float(total - stored) / self.PAGE_SIZE))
    if pending >= self.maxpages:
        pending = self.maxpages
    cid = params.customized['cid']
    xid = params.customized['xid']
    pid = params.customized['pid']
    for pageno in range(1, pending + 1):
        if pageno == 1:
            self.getcomments(params)
            continue
        self.storeurl(LeComments.COMMENTS_URL % (cid, pageno, xid, pid),
                      params.originalurl, LeComments.STEP_4)
def step2_ac(self, params):
    """Tencent AC (动漫视频) only: queue the comment-list pages.

    Fix: guard against page_size == 0 before dividing — when the count node
    exists but no comment-content-detail nodes are parsed, the original
    raised ZeroDivisionError.
    """
    url_id = params.customized['url_id']
    xhtml = etree.HTML(params.content)
    # the count node is frequently missing/malformed on this site
    comments_count = xhtml.xpath('//*[@id="pagination-node"]/span/em/text()')
    if comments_count:
        comments_count = int(comments_count[0])
    else:
        Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
        return
    # per-page size inferred from the detail nodes on this page
    page_size = len(xhtml.xpath('//*[@class="comment-content-detail"]'))
    cmtnum = CMTStorage.getcount(params.originalurl, True)
    NewsStorage.setcmtnum(params.originalurl, comments_count)
    if cmtnum >= comments_count:
        return
    if page_size == 0:
        # comments claimed but none parsed — cannot compute a page count
        Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
        return
    page_num = int(math.ceil((float(comments_count) / page_size)))
    if int(page_num) >= self.maxpages:
        page_num = self.maxpages
    for page in range(1, page_num + 1):
        url = self.AC_COMMENTS_URL.format(url_id, page)
        self.storeurl(url, params.originalurl, self.STEP_COMMENT_NEXT_PAGE)
def process(self, proparam):
    """Leiphone comment pipeline: STEP_1 builds the comment URL, STEP_2 reads
    the count and parses the comments inline, STEP_3 is a no-op.

    Cleanup: removed the large commented-out dead code and the STEP_2 local
    `articleId`, which was only referenced by that dead code.
    """
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step is LeiphoneComments.STEP_1:
            # article id embedded in the page markup
            articleId = self.r.getid('data-article_id', proparam.content)
            comments_url = LeiphoneComments.COMMENTS_URL % (articleId)
            self.storeurl(comments_url, proparam.originalurl,
                          LeiphoneComments.STEP_2, {'articleId': articleId})
        elif proparam.step == LeiphoneComments.STEP_2:
            # strip anything outside the outermost JSON braces
            comments = proparam.content[
                proparam.content.index('{'):proparam.content.rindex('}') + 1]
            comments = json.loads(comments)
            comments_count = float(comments['allCount']['num'])
            NewsStorage.setcmtnum(proparam.originalurl, comments_count)
            if int(comments_count) == 0:
                return
            # incremental check
            cmtnum = CMTStorage.getcount(proparam.originalurl, True)
            if cmtnum >= comments_count:
                return
            # parse the comments carried in this payload
            self.geturlcomments(proparam)
        elif proparam.step == LeiphoneComments.STEP_3:
            # comments are fully handled in STEP_2
            return
        else:
            return
    except:
        Logger.printexception()
def process_book(self, params):
    """17k.com book-comment pipeline.

    STEP_1: build the first comment-list URL from the book id.
    STEP_2: read count/page-count from page 1 and queue every page.
    STEP_3: store each comment that is not already in the DB.
    """
    try:
        if params.step == Comments.STEP_1:
            # book id from the original URL
            bookId = self.r.parse('^http://www\.17k\.com/book/(\w+).html$',
                                  params.originalurl)[0]
            # first comment page
            comments_url = Comments.COMMENTS_URL % (bookId, 1, Comments.PAGE_SIZE)
            # hand to the download platform; page 1 comes back as STEP_2
            self.storeurl(comments_url, params.originalurl, Comments.STEP_2,
                          {'bookId': bookId})
        elif params.step == Comments.STEP_2:
            bookId = params.customized['bookId']
            # JSON response of the first comment page
            comments = json.loads(params.content)
            comments_count = int(comments['page']['count'])
            # incremental check: skip when the DB already holds as many
            cmtnum = CMTStorage.getcount(params.originalurl)
            if cmtnum >= comments_count:
                return
            NewsStorage.setcmtnum(params.originalurl, comments_count)
            # NOTE(review): `lasttime` is never used below; the getter is kept
            # in case the True flag has side effects — TODO confirm and drop.
            lasttime = CMTStorage.getlastpublish(params.originalurl, True)
            # total number of comment pages
            page_count = int(comments['page']['pagecount'])
            if page_count == 0:
                return
            if page_count >= self.maxpages:
                page_count = self.maxpages
            # queue every comment page (page 1 included) for STEP_3 parsing
            for page in range(1, page_count + 1, 1):
                commentUrl = Comments.COMMENTS_URL % (bookId, page,
                                                      Comments.PAGE_SIZE)
                self.storeurl(commentUrl, params.originalurl, Comments.STEP_3,
                              {'bookId': bookId})
        elif params.step == Comments.STEP_3:
            # parse one page of comments; store only unseen ones
            commentsinfo = json.loads(params.content)
            for comment in commentsinfo['page']['result']:
                curtime = TimeUtility.getuniformtime(
                    comment['creationDate'])
                content = comment['summary']
                nick = comment['marks']['nikeName']
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    except Exception, e:
        traceback.print_exc()
def step2(self, params):
    """Fetch one page of comments and queue the next (cursor pagination).

    Fix: the outer ``try:`` was commented out (``# try:``) while its trailing
    ``except: Logger.printexception()`` remained, leaving a dangling except
    clause — the outer try is restored.
    """
    # tempcmttotal accumulates the number of comments seen this spider run
    try:
        url_id = params.customized['url_id']
        comment_id = params.customized['comment_id']
        before_update = params.customized['before_update']
        tempcmttotal = params.customized.get('tempcmttotal', 0)
        try:
            jsondata = json.loads(params.content)
            last = jsondata['data']['last']
            hasnext = jsondata['data']['hasnext']
            cmttotal = float(jsondata['data']['total'])
            NewsStorage.setcmtnum(params.originalurl, cmttotal)
        except:
            Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
            return
        temptimes = []
        for comment in jsondata['data']['commentid']:
            tempcmttotal += 1
            content = comment['content']
            time = TimeUtility.getuniformtime(comment['time'])
            temptimes.append(time)
            user = comment['userinfo'].get('nick', 'anonymous')
            # storecmt itself deduplicates via the exist interface
            CMTStorage.storecmt(params.originalurl, content, time, user)
        # decide whether to follow the cursor to the next page
        nextflag = True
        if temptimes:
            min_update = min(temptimes)
            max_update = max(temptimes)
            # publish-time cutoff: qq comments arrive ordered, newest first,
            # so once we pass before_update there is nothing new further on
            if min_update < before_update:
                nextflag = False
        # volume limits for a single run
        if tempcmttotal >= self.comment_maxnum:
            nextflag = False
        if float(tempcmttotal) / self.page_size > self.maxpages:
            nextflag = False
        if hasnext and nextflag:
            url = self.COMMENTS_URL.format(url_id, last, self.page_size)
            self.storeurl(
                url, params.originalurl, self.STEP_COMMENT_NEXT_PAGE, {
                    'url_id': url_id,
                    'comment_id': last,
                    'before_update': before_update,
                    'tempcmttotal': tempcmttotal
                })
    except:
        Logger.printexception()
def process(self, params):
    """Ku6 video comments: step None builds the page-1 URL, STEP_2 fans out
    all pages, STEP_3 stores the comments.

    Fix: ``CMTStorage.getcmtnum`` does not match the accessor every sibling
    handler uses for the incremental check; replaced with
    ``CMTStorage.getcount(url, True)``.
    """
    try:
        if params.step is None:
            # video id from the original URL
            oid = self.r.parse('^http://v\.ku6\.com/show/([\w-]+..).html',
                               params.originalurl)[0]
            comments_url = Ku6Comments.COMMENTS_URL % (oid, 1, 1)
            self.storeurl(comments_url, params.originalurl, Ku6Comments.STEP_2,
                          {'oid': oid})
        elif params.step == Ku6Comments.STEP_2:
            oid = params.customized['oid']
            comments = json.loads(params.content)
            comments_count = float(comments['data']['count'])
            NewsStorage.setcmtnum(params.originalurl,
                                  int(comments['data']['count']))
            if comments_count == 0:
                return
            # incremental check against the stored comment count
            cmtnum = CMTStorage.getcount(params.originalurl, True)
            if cmtnum >= comments_count:
                return
            # queue every comment page
            for page in range(
                    0,
                    int(math.ceil(comments_count / Ku6Comments.PAGE_SIZE)) + 1,
                    1):
                commentUrl = Ku6Comments.COMMENTS_URL % (
                    oid, Ku6Comments.PAGE_SIZE, page + 1)
                self.storeurl(commentUrl, params.originalurl,
                              Ku6Comments.STEP_3, {'oid': oid})
        elif params.step == Ku6Comments.STEP_3:
            commentsinfo = json.loads(params.content)
            if not commentsinfo['data']['list']:
                return
            for comment in commentsinfo['data']['list']:
                curtime = TimeUtility.getuniformtime(
                    int(comment['commentCtime']))
                content = comment['commentContent']
                # NOTE(review): nick is assigned the comment text, as in the
                # original; looks like a wrong field — TODO confirm the
                # response schema before changing it.
                nick = comment['commentContent']
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    except Exception:
        Logger.printexception()
def step2(self, params):
    """Parse the comment total from the JSONP payload and queue pages by site type."""
    content = params.content
    if "{" not in content:
        Logger.getlogging().debug('Get the data error')
        return
    # strip anything outside the outermost JSON braces
    payload = json.loads(content[content.index('{'):content.rindex('}') + 1])
    total = int(payload['data']['misc']['count'])
    Logger.getlogging().debug('{url} comment: {ct}'.format(
        url=params.url, ct=total))
    NewsStorage.setcmtnum(params.originalurl, total)
    # incremental check
    stored = CMTStorage.getcount(params.originalurl, True)
    if stored >= total:
        return
    pending = int(math.ceil(float(total - stored) / self.PERPAGE))
    if pending >= self.maxpages:
        pending = self.maxpages
    if self.r.match(self.TYPE1, params.originalurl):
        movieid = params.customized['movieid']
        for pageno in range(1, pending + 1):
            if pageno == 1:
                self.step3(params)
                continue
            comment_url = KanKanComments.COMMENTS_URL1.format(
                movieid=movieid, page=pageno, perpage=self.PERPAGE)
            self.storeurl(comment_url, params.originalurl, KanKanComments.STEP_3)
    elif self.r.match(self.TYPE2, params.originalurl) or self.r.match(self.TYPE3, params.originalurl):
        # TYPE2 and TYPE3 had byte-identical handling in the original
        type = params.customized['type']
        sid = params.customized['sid']
        for pageno in range(1, pending + 1):
            if pageno == 1:
                self.step3(params)
                continue
            comment_url = KanKanComments.COMMENTS_URL2 % (type, sid, pageno,
                                                          self.PERPAGE)
            self.storeurl(comment_url, params.originalurl, KanKanComments.STEP_3)
def process(self, params):
    """Pcauto comments: queue page 1, fan out the remaining pages, parse pages.

    Cleanup: removed the unused `comments = []` accumulator and the dead
    commented-out URLStorage code in STEP_3.
    """
    try:
        if params.step is None:
            # first comment page
            comments_url = PcautoComments.COMMENTS_URL % (
                params.originalurl, 1, PcautoComments.PAGE_SIZE)
            self.storeurl(comments_url, params.originalurl, PcautoComments.STEP_2)
        elif params.step == PcautoComments.STEP_2:
            comments = json.loads(params.content)
            comments_count = int(comments['total'])
            NewsStorage.setcmtnum(params.originalurl, comments_count)
            if comments_count == 0:
                return
            # incremental check
            cmtnum = CMTStorage.getcount(params.originalurl, True)
            if cmtnum >= comments_count:
                return
            page_num = int(math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            # queue every comment page for STEP_3 parsing
            for page in range(1, page_num + 1, 1):
                commentUrl = PcautoComments.COMMENTS_URL % (
                    params.originalurl, page, PcautoComments.PAGE_SIZE)
                self.storeurl(commentUrl, params.originalurl, PcautoComments.STEP_3)
        elif params.step == PcautoComments.STEP_3:
            commentsinfo = json.loads(params.content)
            for comment in commentsinfo['data']:
                updatetime = comment['createTime']
                content = comment['content']
                curtime = TimeUtility.getuniformtime(updatetime)
                try:
                    nick = comment['nickName']
                except:
                    nick = 'anonymous'
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    except Exception:
        traceback.print_exc()
def step2_news(self, params):
    """Parse one page of news comments, store them, and queue the next page.

    Pagination is cursor-based: the next request carries both the next page
    number and the id of the last comment on this page (lastcmtid).
    """
    objectid = params.customized['objectid']
    channel = params.customized['channel']
    type = params.customized['type']
    clienttype = params.customized['clienttype']
    key = params.customized['key']
    pageno = params.customized['pageno']
    content = params.content
    try:
        # strip anything outside the outermost JSON braces (JSONP wrapper)
        data = content[content.index('{'):content.rindex('}') + 1]
    except:
        # Logger.printexception() returns None, so this is just log-and-return
        return Logger.printexception()
    data = json.loads(data)
    datalist = data['list']
    if not datalist:
        return
    timelist = []
    for item in datalist:
        curtime = item['createTime']
        content = item['content']
        # nick is not extracted here; stored as an empty string
        CMTStorage.storecmt(params.originalurl, content, curtime, '')
        timelist.append(TimeUtility.getuniformtime(curtime))
    curcmtnum = data['cnum']
    # record the total only once, from the first page
    if pageno == 1:
        NewsStorage.setcmtnum(params.originalurl, curcmtnum)
    # stop when even the oldest comment on this page is not new
    if not self.isnewesttime(params.originalurl, min(timelist)):
        return
    #dbcmtnum = CMTStorage.getcount(params.originalurl, True)
    #pages = int(math.ceil(float(curcmtnum - dbcmtnum) / self.news_pagesize))
    pages = int(math.ceil(float(curcmtnum) / self.news_pagesize))
    if pageno >= self.maxpages or pageno >= pages:
        return
    # cursor for the next request: id of the last comment on this page
    lastcmtid = data['list'][-1]['id']
    pageno = pageno + 1
    comment_url = self.new_commonurl.format(objectid=objectid,
                                            channel=channel,
                                            type=type,
                                            clienttype=clienttype,
                                            key=key,
                                            pageno=pageno,
                                            lastcmtid=lastcmtid)
    self.storeurl(
        comment_url, params.originalurl, self.STEP_COMMENT_EACH_PAGE, {
            'objectid': objectid,
            'channel': channel,
            'type': type,
            'clienttype': clienttype,
            'key': key,
            'pageno': pageno
        })