def step3bbs(self, params):
    """Step 3: extract all comments from the JSON payload fetched in step 2.

    Keeps only comments newer than the last crawl time recorded for
    params.originalurl, then persists them.
    (Removed a large block of dead, commented-out legacy extraction code.)
    """
    Logger.getlogging().info("Tmtpostcommnets.STEP_3")
    commentsinfo = json.loads(params.content)
    comments = []
    jsondata = commentsinfo['data']
    # No comment entries -> nothing to do.
    if not jsondata:
        return
    for data in jsondata:
        cmti = CommentInfo()
        cmti.content = data['comment']
        # Normalize the raw timestamp before the incremental-update check.
        tm = gettimeutil.getuniformtime(data['time_created'])
        # storeupdatetime returns True only for comments newer than the
        # last recorded crawl time (incremental crawling).
        if URLStorage.storeupdatetime(params.originalurl, tm):
            comments.append(cmti)
    # Persist any newly collected comments.
    if len(comments) > 0:
        self.commentstorage.store(params.originalurl, comments)
def step2_2(self, params):
    """Parse comment nodes out of the HTML fragment embedded in the JSON
    response and store the ones newer than the last crawl.

    (Removed dead, commented-out count-tracking code.)
    """
    try:
        jsondata = json.loads(params.content)
        data = jsondata['data']
        soup = BeautifulSoup(data, 'html5lib')
        divs = soup.select('.comment')
    except:
        # Malformed payload or missing fields: treat as "no comments".
        Logger.getlogging().warning(
            '{url}:30000 No comments'.format(url=params.originalurl))
        return
    comments = []
    for div in divs:
        cmti = CommentInfo()
        # The comment body lives in the tag whose style contains padding-top.
        cmti.content = div.find(attrs={
            'style': re.compile('padding-top')
        }).get_text().strip()
        tm = div.select_one('.show-time').get_text()
        tm = getuniformtime(tm)
        if not tm:
            continue
        # Keep only comments newer than the last recorded crawl time.
        if URLStorage.storeupdatetime(params.originalurl, tm):
            comments.append(cmti)
    if len(comments) > 0:
        # Persist the newly collected comments.
        self.commentstorage.store(params.originalurl, comments)
def step3(self, params):
    """Step 3: extract comments and their post times from one page.

    On page 1 the first entry is the opening post rather than a comment,
    so extraction starts at index 1 there.
    """
    Logger.getlogging().info("Flash8Comments.STEP_3")
    page = params.customized['page']
    xparser = XPathUtility(params.content)
    commentsinfo = xparser.getcomments('//td[@class="t_f"]')
    commentstime = xparser.getcomments('//div[@class="authi"]/em')
    comments = []
    # BUGFIX: use == rather than 'is' for integer comparison; identity of
    # small ints is a CPython caching detail, not a language guarantee.
    if page == 1:
        statrIndex = 1
    else:
        statrIndex = 0
    for index in range(statrIndex, len(commentstime), 1):
        cmti = CommentInfo()
        # Keep only comments newer than the last recorded crawl time.
        if URLStorage.storeupdatetime(params.originalurl,
                                      commentstime[index]):
            cmti.content = commentsinfo[index]
            comments.append(cmti)
    # Persist the newly collected comments.
    if len(comments) > 0:
        self.commentstorage.store(params.originalurl, comments)
def step2(self, params):
    """Parse one comments JSON page, store new comments, and queue the
    next page via POST.

    Fixes: removed a stray debug `print`; returns early on an empty
    comment list (previously `tm` was referenced after the loop without
    ever being bound, raising NameError). Dead commented-out code removed.
    """
    try:
        jsondata = json.loads(params.content)
        comments_total = int(jsondata['comments_total'])
        comments_data = jsondata['comments']
    except:
        Logger.getlogging().warning(
            '{url}:30000 No comments'.format(url=params.originalurl))
        return
    # Empty page: stop paging (also avoids using an unbound timestamp
    # when building post_data below).
    if not comments_data:
        return
    comments = []
    for comment in comments_data:
        cmti = CommentInfo()
        cmti.content = comment['txtcontent']
        tm = comment['addtime']
        # Keep only comments newer than the last recorded crawl time.
        if URLStorage.storeupdatetime(params.originalurl, tm):
            comments.append(cmti)
    if len(comments) > 0:
        # Persist the newly collected comments.
        self.commentstorage.store(params.originalurl, comments)
    # Advance the paging cursor; 'tm' is the addtime of the last comment
    # seen on this page.
    self.post_data['p'] = str(int(self.data['p'] + self.page_size))
    self.post_data['t'] = TimeUtility.getuniformdate(tm, '%Y-%m-%d+%H%M%S')
    self.storeposturl(self.post_url, params.originalurl, self.STEP_2,
                      self.post_data)
def process(self, params):
    # Dispatch on crawl step: STEP_1 builds the comments URL from the
    # article key; STEP_2 parses the JSONP payload and stores new comments.
    try:
        if params.step is AllComments.STEP_1:
            # Article key = first number in the last URL path segment.
            key = int(re.findall("\d+", params.url.split("/")[-1])[0])
            comments_url = AllComments.COMMENTS_URL % (key)
            self.storeurl(comments_url, params.originalurl,
                          AllComments.STEP_2, {'key': key})
        elif params.step is AllComments.STEP_2:
            # Strip the JSONP wrapper data(...) to get the raw JSON text.
            jsoncontent = self.r.parse('data\((.*?)\)', params.content)[0]
            comments = json.loads(jsoncontent)
            pcontent = []
            ptime = []
            index = 0
            for index in range(0, len(comments['comments'])):
                pcontent.append(
                    comments['comments'][index]['comment_content'])
                ptime.append(comments['comments'][index]['comment_date'])
            # Map time -> content.  NOTE(review): comments sharing the same
            # timestamp overwrite each other in this dict — confirm that is
            # acceptable for this site.
            dataresult = {}
            for i in range(len(pcontent)):
                dataresult[ptime[i]] = pcontent[i]
            comments = []
            # Sort (time, content) pairs newest-first by timestamp string.
            dataresult = sorted(dataresult.iteritems(),
                                key=lambda dataresult: dataresult[0],
                                reverse=True)
            for k in range(0, len(dataresult)):
                # Keep only comments newer than the last recorded crawl time.
                if URLStorage.storeupdatetime(params.originalurl,
                                              dataresult[k][0]):
                    cmti = CommentInfo()
                    cmti.content = dataresult[k][1]
                    comments.append(cmti)
            # Persist whatever was collected (possibly an empty list).
            self.commentstorage.store(params.originalurl, comments)
    except Exception, e:
        traceback.print_exc()
        Logger.getlogging().error(e.message)
def geturlcomments(self, params): # 获取具体评论 xparser = XPathUtility(params.content) comments_xpath = xparser.xpath('//*[@id="short_comment_content"]') if not comments_xpath: return # 获取发布时间 ip_pubtimes_xpath = xparser.getlist('//*[@id="short_comment_left"]') if len(comments_xpath) == len(ip_pubtimes_xpath): comments = [] # 获取评论 for index in range(0, len(comments_xpath), 1): cmti = CommentInfo() publicTime = ip_pubtimes_xpath[index] if self.r.search(ur'\d{2}-\d+-\d+ \d+:\d+', publicTime): publicTime = '20' + self.r.parse(ur'\d{2}-\d+-\d+ \d+:\d+', publicTime)[0] if self.r.search(ur'\d+/\d+/\d+ \d+:\d+:\d+', publicTime): publicTime = self.r.parse(ur'\d+/\d+/\d+ \d+:\d+:\d+', publicTime)[0] if URLStorage.storeupdatetime(params.originalurl, getuniformtime(publicTime)): # 获取增加的评论(根据时间比较) cmti.content = comments_xpath[index].text comments.append(cmti)
def step2(self, params):
    """Parse one page of comment JSON; store new comments and, when every
    comment on the page was new, queue the next page.

    BUGFIX: comments were stored twice — once unconditionally and once
    more inside the `backflag == False` branch — duplicating every stored
    comment. They are now stored exactly once.
    """
    try:
        newsId = params.customized['newsId']
        jsondata = json.loads(params.content)
        backflag = False
        if jsondata:
            comments = []
            for comment in jsondata:
                cmti = CommentInfo()
                # Keep only comments newer than the last crawl time.
                if URLStorage.storeupdatetime(params.originalurl,
                                              str(comment['commentTime'])):
                    cmti.content = comment['commentContent']
                    cmti.commentid = comment['commentId']
                    comments.append(cmti)
                else:
                    # Hit a comment older than the last crawl: stop paging.
                    backflag = True
            # Persist the collected comments exactly once.
            self.commentstorage.store(params.originalurl, comments)
            if backflag == False:
                # Whole page was new: fetch the next page.
                self.pageno += 1
                comment_url = self.COMMENTS_URL.format(
                    self.pageno, self.page_size, newsId)
                self.storeurl(comment_url, params.originalurl,
                              self.STEP_COMMENT_FIRST_PAGE,
                              {'newsId': newsId})
    except:
        Logger.printexception()
def step3(self, params):
    """Step 3: extract comments (and their attached replies) from JSON.

    BUGFIX: the final guard was `len(comments) >= 0`, which is always
    true and therefore stored empty lists on every call; changed to `> 0`
    to match every other extractor in this file.
    """
    jsondata = json.loads(params.content)
    comments = []
    for comment in jsondata:
        cmti = CommentInfo()
        curcomtime = int(comment['created'])
        # The first comment carries the newest timestamp; only comments
        # newer than the last recorded crawl time are kept.
        if URLStorage.storeupdatetime(
                params.originalurl,
                TimeUtility.getuniformdate2(curcomtime)):
            cmti.content = comment['contents']
            comments.append(cmti)
            # Collect replies attached to this comment.
            # NOTE(review): the original single-line source is ambiguous
            # about whether reply handling sits inside this branch or at
            # loop level; placed inside (replies of new comments only) —
            # confirm against the original file.
            if int(comment['comment_reply_total']) > 0:
                reply = comment['reply']
                for num in range(0, int(comment['comment_reply_total']), 1):
                    recmti = CommentInfo()
                    recmti.content = reply[num]['contents']
                    comments.append(recmti)
    # Persist only when something new was collected.
    if len(comments) > 0:
        self.commentstorage.store(params.originalurl, comments)
def process(self, params):
    # Three-step crawl: STEP_1 scrapes the thread key from the page,
    # STEP_2 reads the total page count, STEP_3 extracts comments and
    # chains the next page while comments are still new.
    try:
        if params.step is AllComments.STEP_1:
            try:
                # Thread id embedded in the page markup.
                threadid = self.r.parse('data-thread-key=\"(.*?)\"',
                                        params.content)[0]
                comments_url = AllComments.COMMENTS_URL % (threadid, 1)
                self.storeurl(comments_url, params.originalurl,
                              AllComments.STEP_2,
                              {'threadid': threadid, 'pageno': 1})
            except:
                return
        elif params.step is AllComments.STEP_2:
            try:
                comments = json.loads(params.content)
                pagetotal = int(comments['cursor']['pages'])
                comments_url = AllComments.COMMENTS_URL % (
                    params.customized['threadid'],
                    params.customized['pageno'])
                self.storeurl(comments_url, params.originalurl,
                              AllComments.STEP_3,
                              {'threadid': params.customized['threadid'],
                               'pageno': params.customized['pageno'],
                               'totalpage': pagetotal})
            except:
                return
        elif params.step is AllComments.STEP_3:
            try:
                if params.customized['pageno'] <= params.customized['totalpage']:
                    comments = json.loads(params.content)
                    roll = len(comments['response'])
                    ptimer = []
                    pcontent = []
                    for key in comments['parentPosts'].keys():
                        # Normalize ISO-ish "T"/"+tz" timestamps to
                        # '%Y-%m-%d %H:%M:%S' before parsing.
                        ptime = comments['parentPosts'][key]['created_at']
                        ptime = ptime.split("+")[0]
                        ptime = ptime.replace("T", " ")
                        ptimer.append(datetime.datetime.strptime(
                            ptime, '%Y-%m-%d %H:%M:%S'))
                        pcontent.append(
                            comments['parentPosts'][key]['message'])
                    # NOTE(review): this second strptime pass re-parses the
                    # str() of already-parsed datetimes — it looks redundant;
                    # confirm before removing.
                    for ctime in range(0, len(ptimer)):
                        ptimer[ctime] = datetime.datetime.strptime(
                            str(ptimer[ctime]), '%Y-%m-%d %H:%M:%S')
                    index = 0
                    comments = []
                    complete = False
                    for comment in pcontent:
                        cmti = CommentInfo()
                        cmti.content = comment
                        # Stop at the first comment not newer than the last
                        # recorded crawl time.
                        if URLStorage.storeupdatetime(params.originalurl,
                                                      str(ptimer[index])):
                            comments.append(cmti)
                        else:
                            complete = True
                            break
                        index = index + 1
                    self.commentstorage.store(params.originalurl, comments)
                    if not complete:
                        # Page fully new: queue the next page via STEP_2.
                        comments_url = AllComments.COMMENTS_URL % (
                            params.customized['threadid'],
                            params.customized['pageno'] + 1)
                        self.storeurl(
                            comments_url, params.originalurl,
                            AllComments.STEP_2,
                            {'threadid': params.customized['threadid'],
                             'pageno': params.customized['pageno'] + 1,
                             'totalpage': params.customized['totalpage']})
            except:
                return
    except Exception, e:
        traceback.print_exc()
        Logger.getlogging().error(e.message)
def process(self, params):
    # Four-step crawl: resolve the article id, then the cms id, then the
    # comment session id (sid), then page through comments until a page
    # fails to parse.
    try:
        if params.step is AllComments.STEP_1:
            # Article id = first number in the last URL path segment.
            aid = re.findall("\d+", params.url.split("/")[-1])[0]
            aid_url = AllComments.AID_URL % (aid)
            self.storeurl(aid_url, params.originalurl, AllComments.STEP_2,
                          {'aid': aid})
        elif params.step is AllComments.STEP_2:
            # cms id is embedded in a JS appidArr literal on the page.
            cms_id = re.findall('appidArr \= \[\"cms\|(.+?)",',
                                str(params.content))[0]
            cms_url = AllComments.KEYID_URL % (
                cms_id, params.customized['aid'], params.originalurl)
            self.storeurl(cms_url, params.originalurl, AllComments.STEP_3, {
                'aid': params.customized['aid'],
                'cmsid': cms_id
            })
        elif params.step is AllComments.STEP_3:
            # The comment session id lives at data._id in the JSON.
            comments = json.loads(params.content)
            sid = comments['data']['_id']
            comment_url = AllComments.COMMENTS_URL % (
                sid, '1', params.customized['cmsid'])
            self.storeurl(comment_url, params.originalurl,
                          AllComments.STEP_4, {
                              'sid': sid,
                              'page': '1',
                              'cmsid': params.customized['cmsid']
                          })
        elif params.step is AllComments.STEP_4:
            comments = json.loads(params.content)
            try:
                comment = []
                index = 0
                for index in range(0, len(comments['data'])):
                    ctime = TimeUtility.getuniformtime2(
                        comments['data'][index]['ctime'])
                    # Keep only comments newer than the last crawl time.
                    if URLStorage.storeupdatetime(params.originalurl,
                                                  str(ctime)):
                        cmti = CommentInfo()
                        cmti.content = comments['data'][index]['content']
                        comment.append(cmti)
                self.commentstorage.store(params.originalurl, comment)
                # Queue the next page; the chain ends when a page fails to
                # parse and the except below returns.
                comment_url = AllComments.COMMENTS_URL % (
                    params.customized['sid'],
                    str(int(params.customized['page']) + 1),
                    params.customized['cmsid'])
                self.storeurl(
                    comment_url, params.originalurl, AllComments.STEP_4, {
                        'sid': params.customized['sid'],
                        'page': str(int(params.customized['page']) + 1),
                        'cmsid': params.customized['cmsid']
                    })
            except:
                return
    except Exception, e:
        traceback.print_exc()
        Logger.getlogging().error(e.message)
def process(self, params):
    """Three-step crawl: find the thread id, fan out one URL per comment
    page, then extract new comments from each page's JSON.

    (Removed several blocks of dead, commented-out legacy code.)
    """
    try:
        if params.step is ChinabyteComments.STEP_1:
            # The thread id is embedded in the page markup.
            threadid = self.r.parse('data-thread-key=\"(.*?)\"',
                                    params.content)
            if not threadid:
                return
            comments_url = ChinabyteComments.COMMENTS_URL % (threadid[0], 1)
            self.storeurl(comments_url, params.originalurl,
                          ChinabyteComments.STEP_2, {
                              'threadid': threadid[0],
                              'pageno': 1
                          })
        elif params.step == ChinabyteComments.STEP_2:
            try:
                threadid = params.customized['threadid']
                comments = json.loads(params.content)
                pagetotal = int(comments['cursor']['pages'])
            except:
                Logger.getlogging().warning('{0}:30000'.format(
                    params.originalurl))
                return
            # pages == 0 means the article has no comments at all.
            if pagetotal == 0:
                return
            # One URL per comment page.
            for page in range(1, pagetotal + 1, 1):
                comments_url = ChinabyteComments.COMMENTS_URL % (threadid,
                                                                 page)
                self.storeurl(comments_url, params.originalurl,
                              ChinabyteComments.STEP_3)
        elif params.step == ChinabyteComments.STEP_3:
            comments = []
            commentinfo = json.loads(params.content)
            for key in commentinfo['parentPosts'].keys():
                updatetime = getuniformtime(
                    commentinfo['parentPosts'][key]['created_at'])
                # Keep only comments newer than the last recorded crawl time.
                if URLStorage.storeupdatetime(params.originalurl,
                                              updatetime):
                    cmti = CommentInfo()
                    cmti.content = commentinfo['parentPosts'][key]['message']
                    comments.append(cmti)
            if len(comments) > 0:
                self.commentstorage.store(params.originalurl, comments)
    except:
        Logger.printexception()
def process(self, params): Logger.getlogging().info(params.url) try: if params.step is Dm123BbsComments.STEP_1: xparser = XPathUtility(params.content) #通过第一次传进来的URL判断是否有后续页面 keyvalue = self.r.parse('tid-(.*?).html', params.url)[0] pagecount = xparser.getnumber( '//*[@class="pages"]/div[@class="fl"]') commentinfo_url = params.url self.storeurl(commentinfo_url, params.originalurl, Dm123BbsComments.STEP_2, { 'keyvalue': keyvalue, 'totalpage': pagecount, 'curpage': 1 }) elif params.step == Dm123BbsComments.STEP_2: keyvalue = params.customized['keyvalue'] curpage = params.customized['curpage'] xparser = XPathUtility(params.content) commentsinfo = xparser.getcomments( '//div[contains(@class,"tpc_content")]') commentstime = self.r.parse(ur'\"(\d+-\d+-\d+ \d+:\d+)\">发表于:', params.content) comments = [] for index in range(0, len(commentstime)): cmti = CommentInfo() if URLStorage.storeupdatetime( params.originalurl, TimeUtility.getuniformtime(commentstime[0] + ':00')): # 获取增加的评论(根据时间比较) cmti.content = commentsinfo[index] comments.append(cmti) if len(comments) > 0: self.commentstorage.store(params.originalurl, comments) nextpageList = [keyvalue, "-page-", str(curpage + 1)] nextpage = '' nextpage = nextpage.join(nextpageList) if int(nextpageList[2]) <= int(params.customized['totalpage']): comment_url = Dm123BbsComments.COMMENT_URL.format( page=nextpage) self.storeurl( comment_url, params.originalurl, Dm123BbsComments.STEP_2, { 'keyvalue': nextpageList[0], 'totalpage': params.customized['totalpage'], 'curpage': curpage + 1 }) except Exception, e: traceback.print_exc()
def process(self, proparam):
    # Three-step crawl for baozou video comments.
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step is BaozouVideoComments.STEP_1:
            # Step 1: derive the article id from the original URL and build
            # the first comment-page URL.
            Logger.getlogging().info("proparam.step is None")
            article_id = int(self.r.parse(r'^http://baozou\.com/\w+/(\d+).*',
                                          proparam.url)[0])
            Logger.getlogging().debug(article_id)
            commentinfo_url = BaozouVideoComments.COMMENTS_URL % (
                article_id, 1, self.per_page)
            self.storeurl(commentinfo_url, proparam.originalurl,
                          BaozouVideoComments.STEP_2,
                          {'article_id': article_id})
        elif proparam.step == BaozouVideoComments.STEP_2:
            # Step 2: read the total comment count and fan out one URL per
            # page of self.per_page comments.
            Logger.getlogging().info("proparam.step == 2")
            article_id = proparam.customized['article_id']
            commentsinfo = json.loads(proparam.content)
            comments_count = int(commentsinfo['total_entries'])
            Logger.getlogging().debug('{url} comment: {ct}'.format(
                url=proparam.url, ct=comments_count))
            if comments_count == 0:
                return
            # Build and queue one comment-page URL per page.
            for page in range(
                    1,
                    int(math.ceil(float(comments_count) / self.per_page)) + 1,
                    1):
                comment_url = BaozouVideoComments.COMMENTS_URL % (
                    article_id, page, self.per_page)
                self.storeurl(comment_url, proparam.originalurl,
                              BaozouVideoComments.STEP_3)
        elif proparam.step == BaozouVideoComments.STEP_3:
            # Step 3: extract all comments from one page's JSON.
            Logger.getlogging().info("proparam.step == 3")
            commentsinfo = json.loads(proparam.content)
            contents = commentsinfo['comments']
            commentsarr = []
            for content in contents:
                cmti = CommentInfo()
                tm = TimeUtility.getuniformtime(content['created_at'],
                                                '%Y-%m-%d %H:%M:%S')
                # Keep only comments newer than the last recorded crawl time.
                if URLStorage.storeupdatetime(proparam.originalurl, tm):
                    cmti.content = content['content']
                    commentsarr.append(cmti)
            # Persist the newly collected comments.
            if len(commentsarr) > 0:
                self.commentstorage.store(proparam.originalurl, commentsarr)
        else:
            Logger.getlogging().error(
                'proparam.step == {step}'.format(step=proparam.step))
    except Exception, e:
        traceback.print_exc()
def process(self, params):
    """Two steps: read the comment count and fan out page URLs, then
    parse comments from each page.

    BUGFIX: page count used `math.ceil(int / int)`, which floor-divides
    first in Python 2 and could drop the final partial page; now divides
    as float, matching the other crawlers in this file. Also fixed the
    `udpatetime` local-variable typo.
    """
    try:
        if params.step is None:
            # Comment count appears as "(N)" in the chart link text.
            xhtml = XPathUtility(html=params.content)
            countsStr = str(
                xhtml.getstring('//*[@id="chartForm"]/div[1]/a[3]'))
            startpos = countsStr.find('(')
            if startpos < 0:
                Logger.getlogging().error(params.originalurl)
                return
            comment_counts = int(countsStr[startpos + 1:countsStr.find(')')])
            Logger.getlogging().debug(comment_counts)
            if comment_counts == 0:
                return
            # One URL per page; float division so the last page is kept.
            for page in range(
                    1,
                    int(math.ceil(float(comment_counts) /
                                  Cine107Comments.PAGE_SIZE)) + 1, 1):
                commentUrl = Cine107Comments.COMMENTS_URL.format(
                    url=params.originalurl, pageno=page)
                Logger.getlogging().debug(commentUrl)
                self.storeurl(commentUrl, params.originalurl,
                              Cine107Comments.STEP_2)
            URLStorage.setcmtnum(params.originalurl, comment_counts)
        elif params.step == Cine107Comments.STEP_2:
            # Parse comment bodies and their <time> stamps in parallel.
            xhtml = XPathUtility(html=params.content)
            comments = []
            contents = xhtml.getlist(
                '//*[@class="flow_commont_list clearfix"]/p')
            updatetimes = xhtml.getlist('//*/time')
            for index in range(0, len(contents), 1):
                updatetime = TimeUtility.getuniformtime(updatetimes[index])
                # Keep only comments newer than the last crawl time.
                if URLStorage.storeupdatetime(params.originalurl,
                                              updatetime):
                    cmti = CommentInfo()
                    Logger.getlogging().debug(contents[index])
                    cmti.content = str(contents[index])
                    comments.append(cmti)
            if len(comments) > 0:
                self.commentstorage.store(params.originalurl, comments)
    except:
        Logger.printexception()
def step3bbs(self, params):
    """Step 3: pull every comment out of the step-2 JSON response and
    store the ones newer than the last recorded crawl time."""
    Logger.getlogging().info("BaozouNewsComments.STEP_3")
    payload = json.loads(params.content)
    fresh = []
    # Walk the comment entries directly rather than indexing by position.
    for entry in payload['comments']:
        item = CommentInfo()
        item.content = entry['content']
        stamp = entry['created_at']
        # Incremental check: keep only entries newer than the last crawl.
        if URLStorage.storeupdatetime(params.originalurl, stamp):
            fresh.append(item)
    if len(fresh) > 0:
        # Persist the newly collected comments.
        self.commentstorage.store(params.originalurl, fresh)
def step3(self, params):
    """Step 3: extract comment text/time pairs from the step-2 JSON and
    store the entries newer than the last recorded crawl time."""
    Logger.getlogging().info("ThirtysixKryptonComments.STEP_3")
    payload = json.loads(params.content)
    collected = []
    # Iterate the items directly instead of by index.
    for item in payload['data']['items']:
        entry = CommentInfo()
        entry.content = item['content']
        raw_time = item['created_at']
        # Normalize the timestamp twice, exactly as the original flow did.
        stamp = TimeUtility.getuniformtime(
            TimeUtility.getuniformtime(raw_time, u'%Y-%m-%d %H:%M:%S'))
        if URLStorage.storeupdatetime(params.originalurl, stamp):
            collected.append(entry)
    if len(collected) > 0:
        # Persist the newly collected comments.
        self.commentstorage.store(params.originalurl, collected)
def step5bbs(self, params): # Step3: 通过Step2设置的url,得到所有评论,抽取评论 soup = BeautifulSoup(params.content, 'html.parser') commentsinfo = soup.select('.cpl_nrr2') commentstime = soup.select('.cpl_nrr1') comments = [] # 获取评论 for index in range(0, int(len(commentsinfo) - 1), 1): # 提取时间 cmti = CommentInfo() cmti.content = commentsinfo[index].get_text() publicTime = self.r.parse( ur'发表于 (.*)', commentstime[index].get_text().strip())[0] publicTime = getuniformtime(publicTime) if URLStorage.storeupdatetime(params.originalurl, publicTime): comments.append(cmti) # 保存获取到的评论 if len(comments) > 0: self.commentstorage.store(params.originalurl, comments)
def getkurlcomments(self, params):
    """Collect comment bodies and publish times from the page; persist
    the comments newer than the last recorded crawl time."""
    parser = XPathUtility(params.content)
    body_nodes = parser.xpath('//*[@class="page-pl-list-text"]')
    time_nodes = parser.xpath('//*[@class="page-pl-user-timer"]')
    if len(body_nodes) >= len(time_nodes):
        # Skip leading bodies that have no matching time node.
        offset = len(body_nodes) - len(time_nodes)
        fresh = []
        for pos in range(offset, len(body_nodes), 1):
            # NOTE(review): indexes time_nodes with the body position, as
            # the original did — can overrun when the lists differ in
            # length; confirm they are always equal in practice.
            if URLStorage.storeupdatetime(
                    params.originalurl,
                    getuniformtime(time_nodes[pos].text)):
                entry = CommentInfo()
                entry.content = body_nodes[pos].text
                fresh.append(entry)
        if len(fresh) > 0:
            # Persist the newly collected comments.
            self.commentstorage.store(params.originalurl, fresh)
def geturlcomments(self, params):
    """Extract comment nodes and matching publish times; store comments
    newer than the last recorded crawl time."""
    parser = XPathUtility(params.content)
    body_nodes = parser.xpath('//*[contains(@id, "cm_")]')
    if not body_nodes:
        return
    time_nodes = parser.getlist(
        '//*[contains(@id,"CList___CommentList_UserLink_")]/..')
    # Proceed only when bodies and times pair up one-to-one.
    if len(body_nodes) == len(time_nodes):
        fresh = []
        for body, when in zip(body_nodes, time_nodes):
            entry = CommentInfo()
            # Incremental check on the normalized publish time.
            if URLStorage.storeupdatetime(params.originalurl,
                                          getuniformtime(when)):
                entry.content = body.text
                fresh.append(entry)
        if len(fresh) > 0:
            # Persist the newly collected comments.
            self.commentstorage.store(params.originalurl, fresh)
def step3(self, params):
    """Step 3: extract comments either from the customized payload
    (single-page case) or by parsing the downloaded page content."""
    Logger.getlogging().info("Dm123NewsComments.STEP_3")
    single_page = params.customized['is_only_one_page']
    if single_page:
        # Step 2 already extracted everything for the one-page case.
        bodies = params.customized['commentsinfos']
        stamps = params.customized['commentstimes']
    else:
        parser = XPathUtility(params.content)
        bodies = parser.getcomments('//div[@class="rbvalueout"]')
        stamps = parser.getcomments('//span[@class="rbtime"]')
    fresh = []
    for pos, raw_stamp in enumerate(stamps):
        stamp = raw_stamp.strip()
        # Keep only comments newer than the last recorded crawl time.
        if URLStorage.storeupdatetime(params.originalurl, stamp):
            entry = CommentInfo()
            entry.content = bodies[pos].strip()
            fresh.append(entry)
    if len(fresh) > 0:
        # Persist the newly collected comments.
        self.commentstorage.store(params.originalurl, fresh)
def step3bbs(self, params):
    """Step 3: extract comment bodies and times; a time reading 刚刚
    ("just now") is replaced by the current wall-clock time."""
    Logger.getlogging().info("Ea3wcomments.STEP_3")
    parser = XPathUtility(params.content)
    bodies = parser.getcomments('//p[@class="comment-content"]')
    stamps = parser.getcomments('//span[@class="time"]')
    fresh = []
    for pos in range(len(bodies)):
        entry = CommentInfo()
        entry.content = bodies[pos]
        raw = str(stamps[pos]).strip().decode("utf8")
        if raw == '刚刚'.decode("utf8"):
            # "Just now" carries no timestamp: substitute the current time.
            stamp = getuniformtime(str(datetime.datetime.now()))
        else:
            stamp = getuniformtime(str(stamps[pos]))
        # Keep only comments newer than the last recorded crawl time.
        if URLStorage.storeupdatetime(params.originalurl, stamp):
            fresh.append(entry)
    if len(fresh) > 0:
        # Persist the newly collected comments.
        self.commentstorage.store(params.originalurl, fresh)
def step3bbs(self, params):
    """Step 3: the response is JSON wrapped in one extra character on
    each side; strip the wrapper, then extract comments incrementally."""
    Logger.getlogging().info("Chinavaluecomments.STEP_3")
    # Drop the wrapping first and last characters before parsing.
    params.content = params.content[1:len(params.content) - 1]
    payload = json.loads(params.content)
    total = payload['RecordCount']
    # Skip the page entirely when the stored count already covers it.
    known = URLStorage.getcmtnum(params.originalurl)
    if known >= total:
        return
    URLStorage.setcmtnum(params.originalurl, total)
    fresh = []
    for record in payload['CommentObjs']:
        entry = CommentInfo()
        entry.content = record['Content']
        # Normalize the timestamp twice, exactly as the original flow did.
        stamp = TimeUtility.getuniformtime(
            TimeUtility.getuniformtime(record['AddTime'],
                                       u'%Y-%m-%d %H:%M'))
        if URLStorage.storeupdatetime(params.originalurl, stamp):
            fresh.append(entry)
    if len(fresh) > 0:
        # Persist the newly collected comments.
        self.commentstorage.store(params.originalurl, fresh)
def process(self, params):
    # Three-step crawl for 17xie book comments: STEP_1 derives the article
    # id and first comment-page URL, STEP_2 reads the total count and fans
    # out one URL per page, STEP_3 extracts comments from each page.
    Logger.getlogging().info(params.url)
    try:
        if params.step is Xie17NewsComments.STEP_1:
            # Article id from the book URL.
            articleId = self.r.parse(
                '^http://xiaoshuo\.17xie\.com/book/(\d+)/',
                params.originalurl)[0]
            comments_url = Xie17NewsComments.COMMENT_URL % (articleId, 1)
            self.storeurl(comments_url, params.originalurl,
                          Xie17NewsComments.STEP_2,
                          {'articleId': articleId})
        elif params.step == Xie17NewsComments.STEP_2:
            articleId = params.customized['articleId']
            # Total count parsed from the "共N人说过" text.
            comment_count = float(
                self.r.parse(ur'共(\d+)人说过', params.content)[0])
            if comment_count == 0:
                return
            # Incremental check: skip when the stored count is up to date.
            cmtnum = URLStorage.getcmtnum(params.originalurl)
            if cmtnum >= comment_count:
                return
            URLStorage.setcmtnum(params.originalurl, comment_count)
            # One URL per comment page.
            page = int(math.ceil(comment_count /
                                 Xie17NewsComments.PAGE_SIZE))
            for page in range(1, page + 1, 1):
                url = Xie17NewsComments.COMMENT_URL % (articleId, page)
                self.storeurl(url, params.originalurl,
                              Xie17NewsComments.STEP_3)
        elif params.step == Xie17NewsComments.STEP_3:
            # Step 3: extract comments from one page.
            Logger.getlogging().info("params.step == 3")
            xparser = XPathUtility(params.content)
            comments = xparser.getcomments('/html/body/ul/li[2]/dl/dd')
            commenttimes = xparser.xpath('/html/body/ul/li[2]/dl/dt/text()')
            commentsInfo = []
            for index in range(0, int(len(commenttimes)), 1):
                # Times appear either as "YYYY年MM月" or in another format
                # handled by getuniformtime.
                if self.r.search(ur'\d+年\d+月', commenttimes[index].strip()):
                    tm = TimeUtility.getuniformtime(
                        str(commenttimes[index]).strip(), '%Y年%m月')
                else:
                    tm = getuniformtime(commenttimes[index].strip())
                if URLStorage.storeupdatetime(params.originalurl, tm):
                    cmti = CommentInfo()
                    # Each comment spans three consecutive dd fragments.
                    comment = comments[index * 3] + comments[index * 3 + 1] + comments[index * 3 + 2]
                    cmti.content = comment
                    commentsInfo.append(cmti)
            # Persist the newly collected comments.
            if len(commentsInfo) > 0:
                self.commentstorage.store(params.originalurl, commentsInfo)
        else:
            # NOTE(review): this function is truncated in this chunk of the
            # file — the else body and the closing except clause are not
            # visible here.
def process(self, proparam):
    # Four-step crawl for 2200book forum comments: article id -> page list
    # -> per-thread (rid) pages -> comment extraction.
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step is bookComments.STEP_1:
            # Article id from the book URL.
            articleId = self.r.parse(
                r'^http://www\.2200book\.com/files/article/\w+/\d+/(\d+)\.htm$',
                proparam.originalurl).__getitem__(0)
            # First comment listing page.
            url = bookComments.COMMENTS_URL % (articleId, 1)
            self.storeurl(url, proparam.originalurl, bookComments.STEP_2,
                          {'articleId': articleId})
        elif proparam.step == bookComments.STEP_2:
            articleId = proparam.customized['articleId']
            # Page count parsed from the ">>N" pager link text.
            xparser = XPathUtility(proparam.content)
            page_count = int(
                self.r.parse(
                    ur'>>(\d+)',
                    xparser.getcomments("//*[@id='pagelink']")[0])[0])
            if int(page_count) == 0:
                return
            # One URL per listing page.
            for page in range(1, int(page_count) + 1, 1):
                url = bookComments.COMMENTS_URL % (articleId, page)
                self.storeurl(url, proparam.originalurl,
                              bookComments.STEP_3)
        elif proparam.step == bookComments.STEP_3:
            # Each listing page links individual threads via rid=N.
            rids = re.findall(r'rid=(\d+)">', proparam.content)
            for rid in rids:
                url = bookComments.COMMENTS_URL_RID % (rid)
                self.storeurl(url, proparam.originalurl,
                              bookComments.STEP_4)
        elif proparam.step == bookComments.STEP_4:
            commentsInfo = []
            xparser = XPathUtility(proparam.content)
            # Post bodies and their "发表于 ..." timestamps.
            comments = xparser.getcomments(
                '//*[@id="sp_2"]/p[2]|//*[@id="b_v_5"]')
            commentTimes = self.r.parse(ur'发表于(:| )?(.+)(</p>|</div>)',
                                        proparam.content)
            for index in range(0, int(len(comments)), 1):
                # commentTimes[index][1] is the captured timestamp group.
                if URLStorage.storeupdatetime(proparam.originalurl,
                                              commentTimes[index][1]):
                    cmti = CommentInfo()
                    cmti.content = comments[index]
                    commentsInfo.append(cmti)
            # Persist the newly collected comments.
            if len(commentsInfo) > 0:
                self.commentstorage.store(proparam.originalurl,
                                          commentsInfo)
        else:
            Logger.getlogging().error("proparam.step == %d", proparam.step)
    except Exception, e:
        traceback.print_exc()
def process(self, params):
    # Three-step crawl for hongxiu book comments.
    Logger.getlogging().info(params.url)
    try:
        if params.step is HongXiuComments.STEP_1:
            # Step 1: book id from the original URL, then first page URL.
            bookId = self.r.parse('http://\w+\.hongxiu\.com/\w+/(\d+).*',
                                  params.originalurl)[0]
            comments_url = HongXiuComments.COMMENT_URL.format(
                bookId=bookId, page=1)
            self.storeurl(comments_url, params.originalurl,
                          HongXiuComments.STEP_2, {'bookId': bookId})
        elif params.step == HongXiuComments.STEP_2:
            bookId = params.customized['bookId']
            # Total count from the record-count element.
            params.content = (params.content).encode('utf-8')
            comment_count = self.r.parse(
                'strong id="htmlrecordcnt" class="total">(\d+)</strong>条',
                params.content)[0]
            # NOTE(review): comment_count is a string here, so this check
            # and the cmtnum comparison below compare str against int —
            # Python 2 permits it; confirm the intended semantics.
            if comment_count == 0:
                return
            # Incremental check against the stored comment count.
            cmtnum = URLStorage.getcmtnum(params.originalurl)
            if cmtnum >= comment_count:
                return
            URLStorage.setcmtnum(params.originalurl, comment_count)
            # One URL per comment page.
            totalPage = int(
                math.ceil(
                    float(comment_count) / HongXiuComments.PAGE_SIZE))
            for page in range(1, totalPage + 1, 1):
                url = HongXiuComments.COMMENT_URL.format(bookId=bookId,
                                                         page=page)
                self.storeurl(url, params.originalurl,
                              HongXiuComments.STEP_3)
        elif params.step == HongXiuComments.STEP_3:
            # Step 3: extract comments and their "... 发表" timestamps.
            Logger.getlogging().info("params.step == 3")
            soup = BeautifulSoup(params.content, 'html5lib')
            comments = soup.select('.inner')
            commenttimes = soup.select('.postTime')
            commentsInfo = []
            for index in range(0, int(len(comments)), 1):
                # Timestamp precedes the literal 发表 ("posted") marker.
                publicTime = self.r.parse(
                    ur'(.*) 发表', commenttimes[index].get_text())[0]
                tm = getuniformtime(publicTime)
                if URLStorage.storeupdatetime(params.originalurl, tm):
                    cmti = CommentInfo()
                    # Comment body follows the 发表 marker.
                    cmti.content = self.r.parse(
                        ur'发表([\s\S]*)',
                        comments[index].get_text().strip())[0]
                    commentsInfo.append(cmti)
            # Persist the newly collected comments.
            if len(commentsInfo) > 0:
                self.commentstorage.store(params.originalurl, commentsInfo)
        else:
            Logger.getlogging().error(
                'proparam.step == {step}'.format(step=params.step))
    except Exception, e:
        traceback.print_exc()
def process(self, params): Logger.getlogging().info(params.url) try: if params.step is SeventeenKComments.STEP_1: #Step1: 通过得到docurl,得到获取评论的首页url。 #Logger.getlogging().info("proparam.step is None") # 在视频url中取出docurl,^http://v\.ifeng\.com\/\w+\/\w+/\d{6}\/[0-9a-z-]+\.shtml # 取URL中的([0-9a-z-]+)参数,此参数为docurl docurl = self.r.parse( '^http://bbs\.17k\.com\/thread-(\d+)-\d+-1\.html', params.originalurl)[0] #Logger.getlogging().debug(docurl) # 评论首页URL为http://comment.ifeng.com/getv.php?job=1&docurl=([0-9a-z-]+)&p=1 commentinfo_url = 'http://bbs.17k.com/thread-{docurl}-1-1.html'.format( docurl=docurl) self.storeurl(commentinfo_url, params.originalurl, SeventeenKComments.STEP_2, {'docurl': docurl}) elif params.step == SeventeenKComments.STEP_2: #将STEP_1中的docurl传下来 docurl = params.customized['docurl'] # Step2: 通过Step1设置url,得到评论的总数,并根据评论总数得到获取其他评论的url。 #Logger.getlogging().info("params.step == 2") # 打开STEP1中URL,截取"count":num字段,取出num的值,num字段为评论总数 xparser = XPathUtility(params.content) commentsinfo = xparser.getnumber( '//*[@class="hm ptn"]/span[5]') #Logger.getlogging().debug(comments_count / self.page_size) #Logger.getlogging().debug(math.ceil(comments_count / self.page_size)) # 保存页面评论量 cmtnum = URLStorage.getcmtnum(params.originalurl) if cmtnum >= int(commentsinfo): return URLStorage.setcmtnum(params.originalurl, int(commentsinfo)) # 总数除以page_size,然后加1,可得到评论总页数comments_count # 循环http://comment.ifeng.com/getv.php?job=1&docurl=([0-9a-z-]+)&p=comments_count,从一开始循环到上一步操作取到的数值,从而得到所有评论的URL,并保存 pagecount = xparser.getnumber('//*[@class="pg"]/label/span') for page in range(1, pagecount + 1, 1): comment_url = SeventeenKComments.COMMENT_URL.format( docurl=docurl, page=page) self.storeurl(comment_url, params.originalurl, SeventeenKComments.STEP_3, {'page': page}) elif params.step == SeventeenKComments.STEP_3: # Step3: 通过Step2设置的url,得到所有评论,抽取评论 #Logger.getlogging().info("params.step == 3") page = params.customized['page'] xparser = XPathUtility(params.content) commentsinfo = 
xparser.getcomments( '//*[contains(@id,"postmessage")]') commentstime = self.r.parse(ur'发表于 (\d+-\d+-\d+ \d+:\d+)</em>', params.content) comments = [] #获取评论 # 设置实际的评论量 if page is 1: statrIndex = 1 else: statrIndex = 0 for index in range(statrIndex, len(commentstime), 1): cmti = CommentInfo() if URLStorage.storeupdatetime(params.originalurl, commentstime[index] + ':00'): # 获取增加的评论(根据时间比较) cmti.content = commentsinfo[index] comments.append(cmti) # 保存获取到的评论 if len(comments) > 0: self.commentstorage.store(params.originalurl, comments) else: Logger.getlogging().error( 'proparam.step == {step}'.format(step=params.step)) except Exception, e: traceback.print_exc()
def process(self, params):
    # Three-step crawl.  NOTE(review): STEP_1/STEP_2 dispatch on
    # Rain8Comments but the later steps reference TaDuComments constants —
    # confirm these two classes intentionally share values.
    Logger.getlogging().info(params.url)
    try:
        if params.step is Rain8Comments.STEP_1:
            # Article id from the tadu.com URL.
            articleId = self.r.parse('http://\w+\.tadu\.com/\w+/(\d+).*',
                                     params.originalurl)[0]
            comments_url = Rain8Comments.COMMENT_URL.format(
                articleId=articleId, page=1)
            self.storeurl(comments_url, params.originalurl,
                          Rain8Comments.STEP_2, {'articleId': articleId})
        elif params.step == Rain8Comments.STEP_2:
            articleId = params.customized['articleId']
            # Total count parsed out of the <h4> heading text.
            xparser = XPathUtility(params.content)
            countstr = xparser.getstring('//h4')
            if self.r.search(u'\d+', countstr):
                # NOTE(review): takes the second number in the heading; and
                # when the search above fails, comment_count below is
                # unbound (NameError) — confirm the heading always contains
                # at least two numbers.
                comment_count = self.r.parse(u'(\d+)', countstr)[1]
            if comment_count == 0:
                return
            # Incremental check against the stored comment count.
            cmtnum = URLStorage.getcmtnum(params.originalurl)
            if cmtnum >= comment_count:
                return
            URLStorage.setcmtnum(params.originalurl, comment_count)
            # One URL per comment page.
            totalPage = int(
                math.ceil(float(comment_count) / TaDuComments.PAGE_SIZE))
            for page in range(1, totalPage + 1, 1):
                url = TaDuComments.COMMENT_URL.format(articleId=articleId,
                                                      page=page)
                self.storeurl(url, params.originalurl, TaDuComments.STEP_3)
        elif params.step == TaDuComments.STEP_3:
            # Step 3: extract comment bodies and times from the list page.
            Logger.getlogging().info("params.step == 3")
            xparser = XPathUtility(params.content)
            comments = xparser.getlist(
                '//ul[@class="cmetlist bookreview-cmetlist"]/li/div/div[2]/p')
            commenttimes = xparser.getlist(
                '//ul[@class="cmetlist bookreview-cmetlist"]/li/div/div[2]/span')
            commentsInfo = []
            for index in range(0, int(len(comments)), 1):
                # Strip the 3-character prefix before the timestamp text.
                publicTime = commenttimes[index][3:]
                cmti = CommentInfo()
                tm = TimeUtility.getuniformtime(publicTime, '%Y-%m-%d %H:%M')
                # Keep only comments newer than the last crawl time.
                if URLStorage.storeupdatetime(params.originalurl, tm):
                    cmti.content = comments[index].strip()
                    commentsInfo.append(cmti)
            # Persist the newly collected comments.
            if len(commentsInfo) > 0:
                self.commentstorage.store(params.originalurl, commentsInfo)
        else:
            Logger.getlogging().error(
                'proparam.step == {step}'.format(step=params.step))
    except Exception, e:
        traceback.print_exc()
def process(self, params):
    # Three-step crawl for zhulang comments.
    Logger.getlogging().info(params.url)
    try:
        if params.step is zhulangComments.STEP_1:
            # Article id from the URL path.
            articleId = self.r.parse('http://www.zhulang.com/(\d+)/',
                                     params.originalurl)[0]
            comments_url = zhulangComments.COMMENT_URL % (articleId, 1)
            self.storeurl(comments_url, params.originalurl,
                          zhulangComments.STEP_2, {'articleId': articleId})
        elif params.step == zhulangComments.STEP_2:
            articleId = params.customized['articleId']
            # Total comment count from the response's 'total' field.
            comment_count = float(self.r.getid('total', params.content))
            if comment_count == 0:
                return
            # Incremental check against the stored comment count.
            cmtnum = URLStorage.getcmtnum(params.originalurl)
            if cmtnum >= comment_count:
                return
            URLStorage.setcmtnum(params.originalurl, comment_count)
            page = int(math.ceil(comment_count / zhulangComments.PAGE_SIZE))
            # NOTE(review): pages are requested starting at 0 here while
            # STEP_1 requested page 1 — confirm the intended page origin.
            for page in range(0, page, 1):
                url = zhulangComments.COMMENT_URL % (articleId, page)
                self.storeurl(url, params.originalurl,
                              zhulangComments.STEP_3)
        elif params.step == zhulangComments.STEP_3:
            # Step 3: extract comments from one page.
            Logger.getlogging().info("params.step == 3")
            # Bodies and times are backslash-escaped HTML fragments inside
            # the raw response text.
            comments = self.r.parse(r'<p class=\\"cmt-txt\\">(.+?)<\\/p>',
                                    params.content)
            commenttimes = self.r.parse(
                r'<span class=\\"cmt-time\\">(.+?)<\\/span>', params.content)
            index = 0
            commentsInfo = []
            for index in range(index, int(len(comments)), 1):
                publicTime = commenttimes[index]
                # Keep only comments newer than the last crawl time.
                if URLStorage.storeupdatetime(params.originalurl,
                                              publicTime):
                    cmti = CommentInfo()
                    # Round-trip through json to decode JSON escapes in the
                    # comment body.
                    x = json.loads('{"comment":"%s"}' %
                                   comments[index].encode('utf8'))
                    cmti.content = (x['comment'])
                    commentsInfo.append(cmti)
            # Persist the newly collected comments.
            if len(commentsInfo) > 0:
                self.commentstorage.store(params.originalurl, commentsInfo)
        else:
            Logger.getlogging().error(
                'proparam.step == {step}'.format(step=params.step))
    except Exception, e:
        traceback.print_exc()