def step3news(self, params):
    """Step 3: extract every comment from the page fetched via the step-2 url and store unseen ones."""
    Logger.getlogging().info("ZolbbsComments.STEP_3")
    xparser = XPathUtility(params.content)
    bodies = xparser.getcomments(
        '//*[@class="comment-list-new"]//*[@class="commli"]/p')
    stamps = xparser.getcomments(
        '//*[@class="comment-list-new"]//*[@class="published-time"]')
    authors = xparser.getcomments(
        '//*[@class="comment-list-new"]//*[@class="user-name"]')
    # One comment per timestamp node; body/author lists are indexed in lockstep.
    for idx, raw in enumerate(stamps):
        tm = raw.strip()
        try:
            curtime = TimeUtility.getuniformtime(getuniformtime(tm),
                                                 u'%Y-%m-%d %H:%M')
        except Exception:
            # Fall back to the single-argument normalisation on any failure.
            curtime = getuniformtime(tm)
        content = bodies[idx]
        nick = authors[idx]
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def getcomments(self, params):
    """Store one json page of comments and queue the next page keyed by the last comment id."""
    docid = params.customized['docid']
    jsondata = json.loads(params.content)
    comments = jsondata['comments']
    if not comments:
        return
    for comment in comments:
        content = comment['comment']
        curtime = comment['createAt']
        nick = comment['nickname']
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    # The id of the last comment on this page is the pagination cursor.
    last_comment_id = comments[-1]['comment_id']
    next_url = self.COMMENTS_URL.format(docid, self.page_size, last_comment_id)
    self.storeurl(next_url, params.originalurl, self.STEP_COMMENT_FIRST_PAGE,
                  {'docid': docid, 'last_comment_id': last_comment_id})
def common_step3(self, proparam):
    """Parse NetEase (non-cloud-reader) comment json and store new comments.

    Stops at the first comment already in storage, treating everything after
    it as seen on a previous crawl.
    """
    try:
        commentsinfo = json.loads(proparam.content)
    except:
        Logger.getlogging().warning(
            '{url}:30000 No comments'.format(url=proparam.originalurl))
        return
    key_comments = 'comments'
    if key_comments not in commentsinfo:
        return
    for key in commentsinfo[key_comments].keys():
        comment = commentsinfo[key_comments][key]
        try:
            nickname = comment['user']['nickname']
        except:
            nickname = 'anonymous'
        # BUGFIX: the exist/store test was inverted — the old code stored a
        # comment only when it was ALREADY in storage and broke out of the
        # loop on every new comment, so nothing new was ever saved.
        if not CMTStorage.exist(proparam.originalurl, comment['content'],
                                comment['createTime'], nickname):
            CMTStorage.storecmt(proparam.originalurl, comment['content'],
                                comment['createTime'], nickname)
        else:
            break
def step2(self, params):
    """Store the first page of comments and queue urls for the remaining pages."""
    Logger.getlogging().info("MkzhanComments.STEP_2")
    comic_id = params.customized['comic_id']
    data = json.loads(params.content)['data']
    total = int(data['count'])
    stored = CMTStorage.getcount(params.originalurl, True)
    # The first page is already in params.content — store it directly.
    for record in data['list']:
        body = record['content']
        pubtime = TimeUtility.getuniformtime(record['create_time'])
        author = record['username']
        if not CMTStorage.exist(params.originalurl, body, pubtime, author):
            CMTStorage.storecmt(params.originalurl, body, pubtime, author)
    NewsStorage.setcmtnum(params.originalurl, total)
    if stored >= total:
        Logger.getlogging().warning(
            '{url}:30000 No comments'.format(url=params.originalurl))
        return
    # Remaining pages, capped by the configured maximum.
    pages = int(math.ceil(float(total - stored) / self.PAGE_SIZE))
    if pages >= self.maxpages:
        pages = self.maxpages
    for page in range(2, pages + 1):
        self.storeurl(
            MkzhanComments.COMMENTS_URL % (comic_id, page, self.PAGE_SIZE),
            params.originalurl, MkzhanComments.STEP_3)
def geturlcomments(self, params):
    """Extract comment time/body/nick triples from a forum page and store unseen ones.

    On page 1 the comment tables start at position 2 (the first table is the
    thread's own post); on later pages every table is a comment.
    """
    xparser = XPathUtility(params.content)
    page = params.customized['page']
    # DRY fix: the two original branches duplicated identical XPaths that
    # differed only in this table-selector prefix.
    table = '//table[position()>1]' if page == 1 else '//table'
    commentstimes = xparser.getcomments(table + '/tbody/tr/td/span[1]')
    commentscontents = xparser.getcomments(
        table + '/tbody/tr[2]/td[@class="post-main"]')
    commentsnicks = xparser.getcomments('//*[@class="name"]/a')
    for index in range(len(commentscontents)):
        # Skips the first 4 chars of the span text — presumably a label in
        # front of the timestamp; TODO confirm against the page markup.
        curtime = TimeUtility.getuniformtime(commentstimes[index][4:])
        content = commentscontents[index].strip()
        nick = commentsnicks[index].strip()
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def geturlcomments(self, proparam):
    # Extract comments embedded as json fragments in the raw page with
    # regexes, then store each unseen (content, time, nick) triple.
    # Regexes over the raw response: comment bodies, publish timestamps,
    # and nicknames respectively.
    comments = re.findall(r'content":"(.+?)","paragraph_id"',
                          proparam.content)
    commentsTime = self.r.parse(
        r'origin_created":"(\d+)","member_avatarPath"', proparam.content)
    nicks = self.r.parse(r'"nickname":"(.*?)","is_hot"', proparam.content)
    # Walk the three lists in lockstep by index.
    index = 0
    for comment in comments:
        # SECURITY NOTE(review): eval() on page-supplied text can execute
        # arbitrary expressions if the site ever emits a crafted value.
        # This is only decoding \uXXXX escapes — prefer
        # s.decode('unicode_escape') / codecs over eval here.
        comment = eval('u"' + comment + '"')
        content = comment.encode('utf-8')
        curtime = TimeUtility.getuniformtime(commentsTime[index])
        nick = eval('u"' + nicks[index] + '"')
        nick = nick.encode('utf-8')
        if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
            CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
        index = index + 1
def bbs_step3(self, params):
    # Parse a bbs thread page: each reply body comes from the "read" nodes,
    # while the author cell text ("<nick>于<timestamp>留言") yields both the
    # publish time and the nickname.
    try:
        xparser = XPathUtility(params.content)
        page = params.customized['page']
        pagecount = params.customized['pagecount']  # NOTE(review): unused here
        comments = []  # NOTE(review): unused
        updatetimes = []
        nicks = []
        contents = xparser.getcomments('//*[@class="read"]')
        mid_times = xparser.getlist('//td[@class="authorname"]')
        # Split every author cell into timestamp and nickname.
        for times in mid_times:
            updatetimes.append(self.r.parse(ur'于(\d+-\d+-\d+ \d+:\d+:\d+)留言',
                                            times)[0])
            nicks.append(self.r.parse(ur'(.*)于', times)[0])
        # On page 0 the first "read" node is skipped — presumably the thread
        # body rather than a reply; TODO confirm against the page markup.
        if page == 0:
            mid_index = 1
        elif page > 0:
            mid_index = 0
        comments_number = xparser.getnumber('//*[@id="msgsubject"]/font')
        if comments_number != 0:
            for index in range(mid_index, len(contents), 1):
                curtime = TimeUtility.getuniformtime(updatetimes[index])
                content = contents[index]
                # Strip the trailing "于" fragment and any "☆" decoration
                # from the parsed nickname.
                nick = nicks[index].split('于')[0].split('☆')[-1]
                if not CMTStorage.exist(params.originalurl, content, curtime,
                                        nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime,
                                        nick)
    except Exception, e:
        traceback.print_exc()
def step3bbs(self, params):
    """Step 3: walk the main replies in the json payload and store unseen ones."""
    Logger.getlogging().info("JoyComments.STEP_3")
    try:
        commentsinfo = json.loads(params.content)
        rows = commentsinfo['result']['mainreplys']['rows']
    except:
        Logger.getlogging().warning(
            '{url} Errorcode:40000'.format(url=params.originalurl))
        Logger.printexception()
        return
    for row in rows:
        reply = row['reply']
        content = reply['reply']['body']['text']
        curtime = TimeUtility.getuniformtime(str(reply['reply']['post_time']))
        nick = reply['user']['name']
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def getComments(self, params, url):
    """Parse one page of a forum thread and store its posts as comments.

    On page 1 the first td.postcontent is the thread body, not a comment,
    so it is skipped. Nicknames are not extracted here ('none').
    """
    # Current page number, recovered from the url.
    pg = self.r.parse(url, params.url)[0]
    soup = BeautifulSoup(params.content, 'html5lib')
    infos = soup.select('tr > td.postcontent')
    # Skip the original post on the first page.
    start = 1 if pg == '1' else 0
    for info in infos[start:]:
        # Only main posts carry the "postmessage defaultpost" div.
        if info.select_one('div[class="postmessage defaultpost"]'):
            content = info.select_one('div[class="postmessage defaultpost"]').get_text()\
                .replace('\t', '').replace('\n', '').replace(' ', '').strip()
            # Post time reads like "发表于 2016-10-7 18:04:25" — drop the
            # 4-char prefix and append seconds.
            updatetime = info.select_one(
                'div.postinfo > font').get_text().strip()[4:] + ':00'
            curtime = getuniformtime(updatetime)
            nick = 'none'
            if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    # Record the running total of stored comments for this url.
    # (Fixed the misspelled local 'comments_couts' and removed the unused
    # 'comments' list from the original.)
    comments_count = CMTStorage.getcount(params.originalurl)
    NewsStorage.setcmtnum(params.originalurl, comments_count)
def step3_ebook(self, params):
    """Parse ebook comment json (html fragment under data.listHtml) and store new comments."""
    try:
        jsoncontent = json.loads(params.content)
        # dict.has_key() was removed in Python 3 — use the `in` operator
        # (identical behavior on Python 2).
        if 'data' not in jsoncontent:
            return
        html = jsoncontent['data']['listHtml']
        if not html:
            return
        soup = BeautifulSoup(html, 'lxml')
        divs = soup.select('div.cf')
        if not divs:
            return
        for div in divs:
            # First <p> is "nick|time", second <p> is the comment body.
            paragraphs = div.select('dd > p')
            content = paragraphs[1].get_text()
            header = paragraphs[0].get_text()
            curtime = TimeUtility.getuniformtime(header.split('|')[-1])
            nick = header.split('|')[0]
            if not CMTStorage.exist(params.originalurl, content, curtime,
                                    nick):
                CMTStorage.storecmt(params.originalurl, content, curtime,
                                    nick)
    except Exception:
        Logger.printexception()
def step3(self, params):
    """Store every comment from the json array; no nickname is available here."""
    for entry in json.loads(params.content):
        CMTStorage.storecmt(params.originalurl, entry['contents'],
                            entry['created'], '')
def step3(self, params):
    """Store every comment keyed in data.commentContentArr (body, post date, user name)."""
    arr = json.loads(params.content)['data']['commentContentArr']
    for key in arr:
        entry = arr[key]
        CMTStorage.storecmt(params.originalurl, entry['content'],
                            entry['postDate'], entry['userName'])
def process(self, proparam):
    # Dispatcher for the jiemian.com comment spider.
    # STEP_1: read the comment count from the article page, decide how many
    #         comment pages to fetch, and queue their urls.
    # STEP_3: parse a fetched comment page (escaped html in a js payload)
    #         and store every unseen comment.
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step is jiemianComments.STEP_1:
            # Article id from the url, used to build comment-page urls.
            articleId = re.findall(r'^http://www\.jiemian\.com/\w+/(\d+)',
                                   proparam.url).__getitem__(0)
            # Record the click count.
            self.setclick(proparam)
            # Total comment count scraped from the article markup.
            comments_count = float(re.findall(r'"comment_count">(\d+)</span>',
                                              proparam.content).__getitem__(0))
            if comments_count:
                NewsStorage.setcmtnum(proparam.originalurl, comments_count)
            if int(comments_count) == 0:
                return
            # Incremental crawl: skip if nothing new since the last run.
            cmtnum = CMTStorage.getcount(proparam.originalurl, True)
            if cmtnum >= comments_count:
                return
            # Pages needed for the delta, capped at maxpages.
            page_num = int(math.ceil(float(comments_count - cmtnum) /
                                     self.PAGE_SIZE))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            # Queue every comment-page url for download.
            for page in range(1, page_num + 1, 1):
                url = jiemianComments.COMMENTS_URL % (articleId, page)
                self.storeurl(url, proparam.originalurl,
                              jiemianComments.STEP_3)
        elif proparam.step == jiemianComments.STEP_3:
            # Vote ("ding") count, stored when present.
            votenum = self.r.getid('ding', proparam.content)
            if votenum == '':
                Logger.getlogging().debug("Unable to get playcount")
            else:
                NewsStorage.setvotenum(proparam.originalurl, votenum)
            # The payload is escaped html ("<\/p>") — regexes pull bodies,
            # dates and author names; lists are walked in lockstep by index.
            comments = re.findall(r'<p>(.+?)<\\/p>', proparam.content)
            ctime = re.findall(r'<span class=\\"date\\">(.+?)<\\/span>',
                               proparam.content)
            nicks = re.findall(r'class=\\"author-name\\">(.+?)<\\/a>',
                               proparam.content)
            for index in range(0, len(comments)):
                # Strip the escaping backslashes before normalising the time.
                time = ctime[index].replace('\\', '')
                curtime = TimeUtility.getuniformtime(time)
                # NOTE(review): eval() on page-supplied text is dangerous —
                # it only decodes \uXXXX escapes; prefer 'unicode_escape'.
                content = eval('u"' + comments[index] + '"').encode('utf-8')
                nick = eval('u"' + nicks[index] + '"').encode('utf-8')
                if not CMTStorage.exist(proparam.originalurl, content,
                                        curtime, nick):
                    CMTStorage.storecmt(proparam.originalurl, content,
                                        curtime, nick)
        else:
            Logger.getlogging().error("proparam.step == %d", proparam.step)
    except Exception, e:
        traceback.print_exc()
def step3(self, params):
    """Store top-level comments and their nested reply chains from the json payload.

    CMTStorage.storecmt() takes (originalurl, content, publish time, user);
    CMTStorage.exist() provides incremental de-duplication.
    """
    try:
        jsondata = json.loads(params.content)
        if jsondata['comments']:
            for comment in jsondata['comments']:
                content = comment['content']
                curtime = TimeUtility.getuniformtime(comment['create_time'])
                nick = comment['passport']['nickname']
                if not CMTStorage.exist(params.originalurl, content, curtime,
                                        nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime,
                                        nick)
                # Walk the reply chain. CLARITY FIX: the old code iterated
                # `comment['comments']` while rebinding `comment` inside the
                # same loop — it only worked because `reply` aliased that
                # exact list. Iterate `reply` directly instead (identical
                # execution trace).
                reply = comment['comments']
                while reply:
                    for subcomment in reply:
                        content = subcomment['content']
                        curtime = TimeUtility.getuniformtime(
                            subcomment['create_time'])
                        nick = subcomment['passport'].get('nickname',
                                                          'anonymous')
                        if not CMTStorage.exist(params.originalurl, content,
                                                curtime, nick):
                            CMTStorage.storecmt(params.originalurl, content,
                                                curtime, nick)
                    # Descend into the last reply's own replies (as before).
                    reply = subcomment['comments']
    except:
        Logger.printexception()
        Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
def dmzjnews_step3(self, params):
    """Trim the wrapper down to the bare json array, then store every comment (no nick)."""
    raw = params.content
    params.content = raw[raw.index('['):raw.rindex(']') + 1]
    for entry in json.loads(params.content):
        curtime = TimeUtility.getuniformtime(entry['create_time'])
        CMTStorage.storecmt(params.originalurl, entry['content'], curtime, '')
def step3news(self, params):
    """Store every comment found in the page's .ncc_content blocks; no nick available."""
    soup = BeautifulSoup(params.content, 'html5lib')
    for block in soup.select('.ncc_content'):
        body = block.select_one('.ncc_content_right_text').get_text()
        pubtime = block.select_one('.ncc_content_right_title > dt').get_text()
        CMTStorage.storecmt(params.originalurl, body, pubtime, '')
def process_book(self, params):
    # Dispatcher for the 17k.com book-comment spider.
    # STEP_1: build the first comment-page url from the book id.
    # STEP_2: read totals from page 1 and queue every comment-page url.
    # STEP_3: parse one comment page and store unseen comments.
    try:
        if params.step == Comments.STEP_1:
            # Book id parsed from the original url.
            bookId = self.r.parse('^http://www\.17k\.com/book/(\w+).html$',
                                  params.originalurl)[0]
            # First comment page.
            comments_url = Comments.COMMENTS_URL % (bookId, 1,
                                                    Comments.PAGE_SIZE)
            # Hand the url to the download platform.
            self.storeurl(comments_url, params.originalurl, Comments.STEP_2,
                          {'bookId': bookId})
        elif params.step == Comments.STEP_2:
            bookId = params.customized['bookId']
            # Json returned for page 1 carries the totals.
            comments = json.loads(params.content)
            comments_count = int(comments['page']['count'])
            # Incremental crawl: stop if nothing new since the last run.
            cmtnum = CMTStorage.getcount(params.originalurl)
            if cmtnum >= comments_count:
                return
            NewsStorage.setcmtnum(params.originalurl, comments_count)
            # NOTE(review): lasttime is fetched but never used below.
            lasttime = CMTStorage.getlastpublish(params.originalurl, True)
            # Page count, capped at maxpages.
            page_count = int(comments['page']['pagecount'])
            if page_count == 0:
                return
            if page_count >= self.maxpages:
                page_count = self.maxpages
            # Queue every comment-page url for download.
            for page in range(1, page_count + 1, 1):
                commentUrl = Comments.COMMENTS_URL % (bookId, page,
                                                      Comments.PAGE_SIZE)
                self.storeurl(commentUrl, params.originalurl, Comments.STEP_3,
                              {'bookId': bookId})
        elif params.step == Comments.STEP_3:
            # Parse one page of comment json and store unseen entries.
            commentsinfo = json.loads(params.content)
            for comment in commentsinfo['page']['result']:
                curtime = TimeUtility.getuniformtime(comment['creationDate'])
                content = comment['summary']
                nick = comment['marks']['nikeName']
                if not CMTStorage.exist(params.originalurl, content, curtime,
                                        nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime,
                                        nick)
    except Exception, e:
        traceback.print_exc()
def geturlcomemnts(self, params):
    """Url-decode, filter and store every comment in data.comment_data."""
    Logger.getlogging().debug(params.originalurl)
    jsondata = json.loads(params.content)
    for entry in jsondata['data']['comment_data']:
        body = Common.urldec(entry['cms_body']).decode(CHARSET_UTF8)
        body = self.filterstr(body)
        Logger.getlogging().debug(body)
        if not CMTStorage.exist(params.originalurl, body,
                                entry['cms_pubdate'], entry['uname']):
            CMTStorage.storecmt(params.originalurl, body,
                                entry['cms_pubdate'], entry['uname'])
def step3(self, params):
    """Store each entry of data.comment; a bad entry is logged and skipped."""
    for entry in json.loads(params.content)['data']['comment']:
        try:
            CMTStorage.storecmt(params.originalurl, entry['content'],
                                entry['time'], '')
        except:
            Logger.printexception()
def step3(self, params):
    """Store each entry of data.listData; a bad entry is logged and skipped."""
    entries = json.loads(params.content)['data']['listData']
    for entry in entries:
        try:
            CMTStorage.storecmt(params.originalurl, entry['content'],
                                entry['createTime'], '')
        except:
            Logger.printexception()
def step3(self, params):
    """Store each entry of data.comments with its integer addTime; bad entries are logged."""
    for entry in json.loads(params.content)['data']['comments']:
        try:
            CMTStorage.storecmt(params.originalurl, entry['content'],
                                int(entry['addTime']), '')
        except:
            Logger.printexception()
def getcontents(self, params):
    """Extract body/time/author from each .post-wrap card and store unseen ones."""
    soup = BeautifulSoup(params.content, 'html5lib')
    for card in soup.select('.all-post > .post-wrap'):
        body = card.select_one('.post-body > a').get_text()
        pubtime = card.select_one('.mr20').get_text()
        author = card.select_one('.post-auther > a').get_text()
        if not CMTStorage.exist(params.originalurl, body, pubtime, author):
            CMTStorage.storecmt(params.originalurl, body, pubtime, author)
def step3(self, params):
    """Store each floor (div id="post_NNN") of the thread; bad floors are logged and skipped."""
    soup = BeautifulSoup(params.content, 'html5lib')
    for post in soup.find_all(attrs={'id': re.compile('post_\d+')}):
        try:
            pubtime = post.find(attrs={'class': 'xg1 xw0'}).get_text()
            body = post.select_one('.t_f').get_text()
            CMTStorage.storecmt(params.originalurl, body, pubtime, '')
        except:
            Logger.printexception()
def step3(self, params):
    """Store each record of data.listData (content + createTime); bad records are logged."""
    for record in json.loads(params.content)['data']['listData']:
        try:
            CMTStorage.storecmt(params.originalurl, record['content'],
                                record['createTime'], '')
        except:
            Logger.printexception()
def getcomments(self, params):
    """Store unseen comments from the json 'data' array (ctime, content, user.username)."""
    payload = json.loads(params.content)
    for record in payload['data']:
        pubtime = TimeUtility.getuniformtime(record['ctime'])
        body = record['content']
        author = record['user']['username']
        if not CMTStorage.exist(params.originalurl, body, pubtime, author):
            CMTStorage.storecmt(params.originalurl, body, pubtime, author)
def step3(self, params):
    """Step 3: store unseen comments from data.list of a follow-up page."""
    Logger.getlogging().info("MkzhanComments.STEP_3")
    for record in json.loads(params.content)['data']['list']:
        body = record['content']
        pubtime = TimeUtility.getuniformtime(record['create_time'])
        author = record['username']
        if not CMTStorage.exist(params.originalurl, body, pubtime, author):
            CMTStorage.storecmt(params.originalurl, body, pubtime, author)
def process(self, params):
    # Dispatcher for the ku6.com video-comment spider.
    # step None: build the first comment-page url from the video id.
    # STEP_2:    read the total from page 1 and queue all comment pages.
    # STEP_3:    parse one comment page and store unseen comments.
    try:
        if params.step is None:
            # Video id parsed from the original url.
            oid = self.r.parse('^http://v\.ku6\.com/show/([\w-]+..).html',
                               params.originalurl)[0]
            # First comment page.
            comments_url = Ku6Comments.COMMENTS_URL % (oid, 1, 1)
            # Hand the url to the download platform.
            self.storeurl(comments_url, params.originalurl,
                          Ku6Comments.STEP_2, {'oid': oid})
        elif params.step == Ku6Comments.STEP_2:
            oid = params.customized['oid']
            comments = json.loads(params.content)
            # Total comment count reported by the service.
            comments_count = float(comments['data']['count'])
            NewsStorage.setcmtnum(params.originalurl,
                                  int(comments['data']['count']))
            if comments_count == 0:
                return
            # Incremental crawl: skip when nothing new since the last run.
            cmtnum = CMTStorage.getcmtnum(params.originalurl, True)
            if cmtnum >= comments_count:
                return
            # Queue every comment-page url for download.
            for page in range(
                    0,
                    int(math.ceil(comments_count / Ku6Comments.PAGE_SIZE)) +
                    1, 1):
                commentUrl = Ku6Comments.COMMENTS_URL % (
                    oid, Ku6Comments.PAGE_SIZE, page + 1)
                self.storeurl(commentUrl, params.originalurl,
                              Ku6Comments.STEP_3, {'oid': oid})
        elif params.step == Ku6Comments.STEP_3:
            commentsinfo = json.loads(params.content)
            if not commentsinfo['data']['list']:
                return
            for comment in commentsinfo['data']['list']:
                curtime = TimeUtility.getuniformtime(
                    int(comment['commentCtime']))
                content = comment['commentContent']
                # NOTE(review): nick is set to the comment BODY, not a user
                # name — looks like a copy-paste bug, but the json schema is
                # not visible here; confirm the correct field before fixing.
                nick = comment['commentContent']
                if not CMTStorage.exist(params.originalurl, content, curtime,
                                        nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime,
                                        nick)
    except Exception, e:
        Logger.printexception()
def process(self, params): try: if params.step is ThepaperComments.STEP_1: # 根据url获取拼接评论的参数 contid = params.originalurl.split('_') contid = contid[-1] # 拼接初始评论url comments_url = ThepaperComments.SOURCE_COMMENTS_URL.format( contid=contid) # 通知下载平台,根据评论url获取第一页评论内容 self.storeurl(comments_url, params.originalurl, ThepaperComments.STEP_2, {'contid': contid}) elif params.step == ThepaperComments.STEP_2: contid = params.customized['contid'] soup = BeautifulSoup(params.content, 'html5lib') divs = soup.find_all(attrs={ 'id': re.compile('comment'), 'class': 'comment_que' }) if not divs: return if self.r.search(ur'startId=(.*)', params.url): for index in range(1, len(divs), 1): tm = divs[index].select_one( '.aqwright > h3 > span').get_text() curtime = getuniformtime(tm) content = divs[index].select_one( '.aqwright > .ansright_cont > a').get_text() nick = divs[index].select_one( '.aqwright > h3 > a').get_text() if not CMTStorage.exist(params.originalurl, content, curtime, nick): CMTStorage.storecmt(params.originalurl, content, curtime, nick) else: for index in range(0, len(divs), 1): tm = divs[index].select_one( '.aqwright > h3 > span').get_text() curtime = getuniformtime(tm) content = divs[index].select_one( '.aqwright > .ansright_cont > a').get_text() nick = divs[index].select_one( '.aqwright > h3 > a').get_text() if not CMTStorage.exist(params.originalurl, content, curtime, nick): CMTStorage.storecmt(params.originalurl, content, curtime, nick) if self.r.search(ur'startId=(.*)', params.url): hotIds = params.customized['hotIds'] else:
def step2(self, params): """获取评论的其他url,及评论""" #每次spider运行的累加数据tempcmttotal # try: url_id = params.customized['url_id'] comment_id = params.customized['comment_id'] before_update = params.customized['before_update'] tempcmttotal = params.customized.get('tempcmttotal', 0) try: jsondata = json.loads(params.content) last = jsondata['data']['last'] hasnext = jsondata['data']['hasnext'] cmttotal = float(jsondata['data']['total']) NewsStorage.setcmtnum(params.originalurl, cmttotal) except: Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS) return temptimes = [] for comment in jsondata['data']['commentid']: tempcmttotal += 1 content = comment['content'] time = TimeUtility.getuniformtime(comment['time']) temptimes.append(time) user = comment['userinfo'].get('nick', 'anonymous') # 保存评论到数据库,可以通过接口exist判断评论是否已经存在 CMTStorage.storecmt(params.originalurl, content, time, user) #对是否继续提取评论进行条件限制 nextflag = True if temptimes: min_update = min(temptimes) max_update = max(temptimes) #发布时间临界点限制:最近两天 #if max_update < self.cmtlastdays: #nextflag = False #发布时间限制:仅针对qq的评论提取策略,该评论的发布时间有序且依次递减 if min_update < before_update: nextflag = False #数量限制 if tempcmttotal >= self.comment_maxnum: nextflag = False if float(tempcmttotal) / self.page_size > self.maxpages: nextflag = False if hasnext and nextflag: url = self.COMMENTS_URL.format(url_id, last, self.page_size) self.storeurl( url, params.originalurl, self.STEP_COMMENT_NEXT_PAGE, { 'url_id': url_id, 'comment_id': last, 'before_update': before_update, 'tempcmttotal': tempcmttotal }) except: Logger.printexception()
def step3(self, params):
    """Step 3: drop the first two and last characters of the wrapper, then store unseen weibo entries."""
    payload = json.loads(params.content[2:-1])
    for entry in payload['data']['weibo']:
        pubtime = TimeUtility.getuniformtime(entry['pub_time'])
        body = entry['content']
        author = str(entry['userinfo']['nickname'])
        if not CMTStorage.exist(params.originalurl, body, pubtime, author):
            CMTStorage.storecmt(params.originalurl, body, pubtime, author)