def step2(self, params):
    """Parse a Tieba search-result page and queue matching article URLs.

    Expects params.customized['key'] (url-encoded search keyword) and
    params.content (HTML of the search-result page).
    """
    try:
        key = params.customized['key']
        soup = BeautifulSoup(params.content, 'html5lib')
        #print soup
        #searchListOne = soup.select('.searchListOne > ul')
        searchListOne = soup.select('.searchListOne > ul > li > div')
        if not searchListOne:
            # No result list on the page -> nothing to queue.
            Logger.getlogging().warning('{}:40000 No urllist'.format(
                params.originalurl))
            return
        # Drop the trailing hidden placeholder:
        # <li id=search_msg style="display:none"></li>
        lis = soup.select(
            '.searchListOne > ul > li'
        )[:-1]
        urllist = []
        for li in lis:
            url = li.select_one('h3 > a').get('href')
            #print '*********',url
            # First span under .source is the publish time, last is the
            # comment count.
            tm = li.select('.source > span')[0].get_text()
            tm = getuniformtime(tm)
            now = getuniformtime(str(time.time()))
            cmt_num = li.select('.source > span')[-1].get_text()
            title = li.select_one('h3').get_text()
            # Keep only results whose title matches the decoded keyword and
            # whose publish time falls within the configured look-back window.
            if Common.checktitle(Common.urldec(key), title):
                if compareNow(tm, self.querylastdays):
                    urllist.append(url)
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
    except:
        #traceback.print_exc()
        Logger.printexception()
        Logger.getlogging().error(
            'extract comment error from {site}'.format(site=params.url))
def step3news(self, params): Logger.getlogging().info("ZolbbsComments.STEP_3") # Step3: 通过Step2设置的url,得到所有评论,抽取评论 xparser = XPathUtility(params.content) commentsinfo = xparser.getcomments( '//*[@class="comment-list-new"]//*[@class="commli"]/p') commentstime = xparser.getcomments( '//*[@class="comment-list-new"]//*[@class="published-time"]') commentsnick = xparser.getcomments( '//*[@class="comment-list-new"]//*[@class="user-name"]') # 获取评论,设置实际的评论量 for index in range(0, len(commentstime), 1): # 提取时间 tm = commentstime[index].strip() try: curtime = TimeUtility.getuniformtime(getuniformtime(tm), u'%Y-%m-%d %H:%M') except Exception, e: curtime = getuniformtime(tm) # 提取评论内容 content = commentsinfo[index] nick = commentsnick[index] if not CMTStorage.exist(params.originalurl, content, curtime, nick): CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def process(self, params):
    """Dispatch crawl steps for Thepaper comments.

    NOTE(review): this function appears truncated -- it ends with a
    dangling 'else:' right after the hotIds lookup; the remainder of the
    STEP_2 branch is missing from this view.
    NOTE(review): STEP_1 is compared with 'is' while STEP_2 uses '==';
    identity comparison on step constants looks unintentional -- confirm.
    """
    try:
        if params.step is ThepaperComments.STEP_1:
            # Content id is the tail of the original url ("..._<contid>").
            contid = params.originalurl.split('_')
            contid = contid[-1]
            # Build the initial comment url.
            comments_url = ThepaperComments.SOURCE_COMMENTS_URL.format(
                contid=contid)
            # Ask the download platform for page 1 of the comments.
            self.storeurl(comments_url, params.originalurl,
                          ThepaperComments.STEP_2, {'contid': contid})
        elif params.step == ThepaperComments.STEP_2:
            contid = params.customized['contid']
            soup = BeautifulSoup(params.content, 'html5lib')
            divs = soup.find_all(attrs={
                'id': re.compile('comment'),
                'class': 'comment_que'
            })
            if not divs:
                return
            # Paginated responses (startId present in the url) skip the
            # first div; first-page responses process all divs.
            if self.r.search(ur'startId=(.*)', params.url):
                for index in range(1, len(divs), 1):
                    tm = divs[index].select_one(
                        '.aqwright > h3 > span').get_text()
                    curtime = getuniformtime(tm)
                    content = divs[index].select_one(
                        '.aqwright > .ansright_cont > a').get_text()
                    nick = divs[index].select_one(
                        '.aqwright > h3 > a').get_text()
                    if not CMTStorage.exist(params.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content,
                                            curtime, nick)
            else:
                for index in range(0, len(divs), 1):
                    tm = divs[index].select_one(
                        '.aqwright > h3 > span').get_text()
                    curtime = getuniformtime(tm)
                    content = divs[index].select_one(
                        '.aqwright > .ansright_cont > a').get_text()
                    nick = divs[index].select_one(
                        '.aqwright > h3 > a').get_text()
                    if not CMTStorage.exist(params.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content,
                                            curtime, nick)
            # Truncated below: dangling else has no body in SOURCE.
            if self.r.search(ur'startId=(.*)', params.url):
                hotIds = params.customized['hotIds']
            else:
def getComments(self, params, url):
    """Extract post comments from a forum thread page and refresh the
    stored comment count for the original url."""
    # Current comment page number, parsed out of the page url.
    pg = self.r.parse(url, params.url)[0]
    soup = BeautifulSoup(params.content, 'html5lib')
    # Post bodies.
    infos = soup.select('tr > td.postcontent')
    # Publish time text is formatted like "发表于 2016-10-7 18:04:25".
    comments = []
    # On page 1 the first entry is the article body itself, not a comment.
    if pg == '1':
        start = 1
    else:
        start = 0
    for info in infos[start:]:
        # Main comment node only.
        if info.select_one('div[class="postmessage defaultpost"]'):
            content = info.select_one('div[class="postmessage defaultpost"]').get_text()\
                .replace('\t','').replace('\n','').replace(' ','').strip()
            # Drop the 4-char "发表于 " prefix and append ":00" seconds.
            updatetime = info.select_one(
                'div.postinfo > font').get_text().strip()[4:] + ':00'
            curtime = getuniformtime(updatetime)
            nick = 'none'
            if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    # Refresh the total comment count from storage.
    comments_couts = CMTStorage.getcount(params.originalurl)
    NewsStorage.setcmtnum(params.originalurl, comments_couts)
def step3bbs(self, params):
    """Step3: parse the comment JSON fetched via the url set in Step2 and
    store comments that are newer than the stored update time.

    Cleanup: removed a large block of commented-out dead code that
    duplicated the live loop below.
    """
    Logger.getlogging().info("Tmtpostcommnets.STEP_3")
    commentsinfo = json.loads(params.content)
    jsondata = commentsinfo['data']
    if not jsondata:
        # Empty payload -> nothing to store.
        return
    comments = []
    for data in jsondata:
        cmti = CommentInfo()
        cmti.content = data['comment']
        # Normalize the API timestamp.
        tm = gettimeutil.getuniformtime(data['time_created'])
        # storeupdatetime returns True only for comments newer than the
        # stored update time.
        if URLStorage.storeupdatetime(params.originalurl, tm):
            comments.append(cmti)
    # Persist anything new.
    if len(comments) > 0:
        self.commentstorage.store(params.originalurl, comments)
def step2(self,params):
    """Parse a Sogou news search-result page: queue urls of recent results
    and queue follow-up "similar news" list urls."""
    soup = BeautifulSoup(params.content, 'html5lib')
    # Explicit "no result" container rendered by the site.
    if soup.find(attrs={"id":re.compile('noresult_part._container')}):
        Logger.getlogging().warning('{url}:40000 No results'.format(url=params.url))
        return
    results = soup.select('.results > .vrwrap')
    if not results:
        Logger.getlogging().warning('{url}:40000 No results'.format(url=params.url))
        return
    urllist = []
    newurllist = []
    for item in results:
        try:
            # Some wrappers carry no headline link at all; skip them.
            if not item.select_one('h3.vrTitle > a'):
                continue
            title = item.select_one('h3.vrTitle > a').get_text()
            href = item.select_one('h3.vrTitle > a').get('href')
            timestr = item.select_one('.news-detail > .news-info > .news-from').get_text()
            times = getuniformtime(timestr)
            Logger.getlogging().debug('title:'+ title)
            Logger.getlogging().debug('time:'+ times)
            # Keep results inside the configured look-back window.
            if compareNow(times, self.querylastdays):
                Logger.getlogging().debug('href:'+ href)
                urllist.append(href)
            # Optional "similar news" link; it is relative, so prepend
            # the site root.
            newitem = item.select_one('#news_similar')
            if newitem:
                newhref = 'http://news.sogou.com/news'+newitem.get('href')
                Logger.getlogging().debug('newhref:'+ newhref)
                newurllist.append(newhref)
        except:
            # One bad result must not abort the whole page.
            Logger.printexception()
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_NEWS)
    if len(newurllist) > 0:
        self.__storeqeuryurllist__(newurllist, self.NEWS_EACH_2)
def geturlcomments(self, params):
    """Collect comment bodies and publish times from a short-comment page.

    NOTE(review): the collected ``comments`` list is never persisted
    within this function as shown -- confirm whether a trailing store
    call was lost (the function may be truncated in this view).
    """
    # Extract the comment bodies.
    xparser = XPathUtility(params.content)
    comments_xpath = xparser.xpath('//*[@id="short_comment_content"]')
    if not comments_xpath:
        return
    # Extract the publish times.
    ip_pubtimes_xpath = xparser.getlist('//*[@id="short_comment_left"]')
    # Only proceed when the two lists line up one-to-one.
    if len(comments_xpath) == len(ip_pubtimes_xpath):
        comments = []
        for index in range(0, len(comments_xpath), 1):
            cmti = CommentInfo()
            publicTime = ip_pubtimes_xpath[index]
            # Two-digit-year form "yy-mm-dd hh:mm": prefix the century.
            if self.r.search(ur'\d{2}-\d+-\d+ \d+:\d+', publicTime):
                publicTime = '20' + self.r.parse(ur'\d{2}-\d+-\d+ \d+:\d+',
                                                 publicTime)[0]
            # Slash-separated form "y/m/d h:m:s".
            if self.r.search(ur'\d+/\d+/\d+ \d+:\d+:\d+', publicTime):
                publicTime = self.r.parse(ur'\d+/\d+/\d+ \d+:\d+:\d+',
                                          publicTime)[0]
            # Keep only comments newer than the stored update time.
            if URLStorage.storeupdatetime(params.originalurl,
                                          getuniformtime(publicTime)):
                cmti.content = comments_xpath[index].text
                comments.append(cmti)
def step2_2(self, params):
    """Parse the comment HTML fragment embedded in a JSON response and
    store new comments via commentstorage."""
    try:
        jsondata = json.loads(params.content)
        # The payload carries an HTML fragment under 'data'.
        data = jsondata['data']
        soup = BeautifulSoup(data, 'html5lib')
        divs = soup.select('.comment')
    except:
        # Malformed payload -> treat as "no comments".
        Logger.getlogging().warning(
            '{url}:30000 No comments'.format(url=params.originalurl))
        return
    #comments_total = len(divs)
    #cmtnum = URLStorage.getcmtnum(params.originalurl)
    #if cmtnum >= comments_total:
    #return
    #URLStorage.setcmtnum(params.originalurl, comments_total)
    comments = []
    #divs.reverse()
    for div in divs:
        cmti = CommentInfo()
        # Comment body sits in the element with an inline padding-top style.
        cmti.content = div.find(attrs={
            'style': re.compile('padding-top')
        }).get_text().strip()
        tm = div.select_one('.show-time').get_text()
        tm = getuniformtime(tm)
        # Skip entries whose timestamp cannot be normalized.
        if not tm:
            continue
        # Keep only comments newer than the stored update time.
        if URLStorage.storeupdatetime(params.originalurl, tm):
            comments.append(cmti)
    if len(comments) > 0:
        # Persist the collected comments.
        self.commentstorage.store(params.originalurl, comments)
def step2(self, params):
    """Filter video search results by publish time and title match, then
    queue the urls that pass both checks."""
    query = params.customized['query']
    page = BeautifulSoup(params.content, 'html5lib')
    boxes = page.select('.videobox')
    if not boxes:
        Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
        return
    decoded_query = Common.urldec(query)
    accepted = []
    for box in boxes:
        caption = box.select_one('.title').get_text()
        stamp = getuniformtime(box.select_one('.date').get_text())
        link = box.select_one('.title > a').get('href')
        Logger.getlogging().debug(caption)
        # Reject by time window first, then by title match; each rejection
        # is logged with its own warning code.
        if not compareNow(stamp, self.querylastdays):
            Logger.log(link, constant.ERRORCODE_WARNNING_NOMATCHTIME)
        elif not Common.checktitle(decoded_query, caption):
            Logger.log(link, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
        else:
            accepted.append(link)
    # Queue the final url list.
    if len(accepted) > 0:
        self.__storeurllist__(accepted, SPIDER_S2_WEBSITE_VIDEO)
def baidutiebasearch_step3(self, params):
    """Parse a Baidu Tieba search-result page and queue recent post urls."""
    soup = BeautifulSoup(params.content, 'html5lib')
    post_list = soup.select('.s_post_list > .s_post')
    urllist = []
    for item in post_list:
        try:
            title = item.select_one('.p_title > a').get_text().strip()
            href = item.select_one('.p_title > a').get('href')
            pubtimeobj = item.find(attrs={'class':'p_green p_date'})
            if not pubtimeobj:
                # Result without a date node: record the miss and skip.
                Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_XPATHVALUE)
                continue
            pubtime = pubtimeobj.get_text()
            pubtime = getuniformtime(pubtime)
            Logger.getlogging().debug(title)
            Logger.getlogging().debug(pubtime)
            # Only sufficiently recent posts are queued; href is relative,
            # so the site root is prepended.
            if self.isyestoday(pubtime):
                Logger.getlogging().debug('https://tieba.baidu.com'+href)
                urllist.append('https://tieba.baidu.com'+href)
            else:
                Logger.log(params.url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
        except:
            # One bad item must not abort the whole page.
            Logger.printexception()
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
def process(self, params):
    """Dispatch one crawl step for Chinabyte comments.

    STEP_1: scrape the thread id from the article page and request page 1
    of the comment API.
    STEP_2: read the page count from the API response and request every
    comment page.
    STEP_3: parse one comment page and store comments newer than the
    stored update time.

    Fixes: STEP_1 was compared with 'is' (identity) while the other steps
    used '=='; step constants must be compared by value. Also removed
    commented-out dead code.
    """
    try:
        if params.step == ChinabyteComments.STEP_1:
            threadid = self.r.parse('data-thread-key=\"(.*?)\"',
                                    params.content)
            if not threadid:
                return
            comments_url = ChinabyteComments.COMMENTS_URL % (threadid[0], 1)
            self.storeurl(comments_url, params.originalurl,
                          ChinabyteComments.STEP_2,
                          {'threadid': threadid[0], 'pageno': 1})
        elif params.step == ChinabyteComments.STEP_2:
            try:
                threadid = params.customized['threadid']
                comments = json.loads(params.content)
                pagetotal = int(comments['cursor']['pages'])
            except:
                # Malformed API response.
                Logger.getlogging().warning('{0}:30000'.format(
                    params.originalurl))
                return
            # pages == 0 means there are no comments at all.
            if pagetotal == 0:
                return
            for page in range(1, pagetotal + 1, 1):
                comments_url = ChinabyteComments.COMMENTS_URL % (threadid,
                                                                 page)
                self.storeurl(comments_url, params.originalurl,
                              ChinabyteComments.STEP_3)
        elif params.step == ChinabyteComments.STEP_3:
            comments = []
            commentinfo = json.loads(params.content)
            for key in commentinfo['parentPosts'].keys():
                updatetime = getuniformtime(
                    commentinfo['parentPosts'][key]['created_at'])
                # Keep only comments newer than the stored update time.
                if URLStorage.storeupdatetime(params.originalurl,
                                              updatetime):
                    cmti = CommentInfo()
                    cmti.content = commentinfo['parentPosts'][key]['message']
                    comments.append(cmti)
            if len(comments) > 0:
                self.commentstorage.store(params.originalurl, comments)
    except:
        Logger.printexception()
def geturlcomments(self, params, startpos=0):
    """Extract comments from a forum page, handling two layouts: the
    paired .info/.date list layout and the Discuz-style td.plc layout.

    startpos: index of the first .info entry to treat as a comment
    (earlier entries are skipped, e.g. the article body).
    """
    soup = BeautifulSoup(params.content, 'html5lib')
    comments = soup.select('.info')
    commentTimes = soup.select('.date')
    commentsInfo = []
    # //*[contains(@id,"postmessage_")]
    if len(comments) <= 0:
        # Discuz layout: each td.plc holds a post with an authorposton
        # time node and a postmessage_ body node.
        tds = soup.select(
            'td.plc')  # soup.find_all("td", attrs={"class": "plc"})
        if tds is None:
            return
        for td in tds:
            timestr = td.find(attrs={'id': re.compile('authorposton')})
            if not timestr:
                continue
            # NOTE(review): this rebinds commentTimes (selected as a list
            # above) to a single normalized time string. Harmless because
            # the two branches are mutually exclusive, but confusing.
            commentTimes = getuniformtime(timestr.get_text())
            if URLStorage.storeupdatetime(params.originalurl, commentTimes):
                contents = td.find(
                    attrs={'id': re.compile('postmessage_')})
                if contents:
                    cmti = CommentInfo()
                    cmti.content = contents.get_text()
                    commentsInfo.append(cmti)
    else:
        # Paired-list layout: comments[i] goes with commentTimes[i].
        for index in range(startpos, int(len(comments)), 1):
            cmti = CommentInfo()
            publicTime = getuniformtime(
                commentTimes[index].get_text()).strip()
            #publicTime = self.r.parse(ur'发表于(.*)', publicTime)[0].strip()
            tm = TimeUtility.getuniformtime(
                TimeUtility.getuniformtime(publicTime, u'%Y-%m-%d %H:%M'))
            # Keep only comments newer than the stored update time.
            if URLStorage.storeupdatetime(params.originalurl, tm):
                cmti.content = comments[index].get_text()
                commentsInfo.append(cmti)
    if len(commentsInfo) > 0:
        # Persist the collected comments.
        self.commentstorage.store(params.originalurl, commentsInfo)
def step3(self, params):
    """Parse the comment JSON payload and store each comment that is not
    already persisted for the original url."""
    payload = json.loads(params.content)
    for entry in payload['data']:
        body = entry['content']
        commentid = entry['id']
        posted = getuniformtime(entry['comment_time'])
        nick = "none"
        # Skip duplicates already held in storage.
        if CMTStorage.exist(params.originalurl, body, posted, nick):
            continue
        CMTStorage.storecmt(params.originalurl, body, posted, nick)
def process(self,params):
    """Two-step S2 query.

    STEP_1: read the search-result page count and queue one search url
    per page (capped at self.maxpages).
    STEP_2: filter result entries by title and publish time and queue the
    surviving urls.

    Fixes: replaced the legacy '<>' operator, and normalized the page
    count to an int once -- the old code clamped totalpage to an int but
    then compared it against the string '0', which was always unequal.
    Also removed commented-out dead code.
    """
    if params.step == S2Query.STEP_1:
        html = etree.HTML(params.content)
        results = html.xpath('//*[@id="results"]')
        if not results:
            return
        totalpage = html.xpath(
            '//*[@id="div_3"]/*[@class="page"]/span/text()')
        if totalpage:
            # Page counter looks like "x/y"; keep the digits of y.
            totalpage = self.r.parse('(\d+)',
                                     totalpage[0].split('/')[-1])[0]
        else:
            Logger.getlogging().info("there are no results you want!")
            return
        totalpage = int(totalpage)
        if totalpage >= self.maxpages:
            totalpage = self.maxpages
        if totalpage == 0:
            return
        urllist = []
        for pages in range(0, totalpage):
            searchurl = S2Query.S2_URL % (pages + 1,
                                          params.customized['key'])
            urllist.append(searchurl)
        self.__storeqeuryurllist__(urllist, S2Query.STEP_2,
                                   {'key': params.customized['key']})
    elif params.step == S2Query.STEP_2:
        # The keyword arrives url-encoded in GBK; re-encode as UTF-8 so it
        # can be compared against page titles.
        comquerkey = Common.urldec(
            params.customized['key']).decode('gbk').encode('utf-8')
        soup = BeautifulSoup(params.content, 'html5lib')
        urllist = []
        divs = soup.find_all(attrs={'class': 'result f s0'})
        if not divs:
            return
        for div in divs:
            title = div.select_one('h3.c-title').get_text()
            # Collapse all whitespace inside the title.
            title = ''.join(title.strip().split())
            # .c-showurl text ends with the publish date; the leading part
            # is the real target url.
            url_tm = div.select_one('.c-showurl').get_text()
            tm = getuniformtime(url_tm.split('/')[-1])
            url = 'http://' + '/'.join(url_tm.split('/')[0:-1])
            Logger.getlogging().debug(title)
            if not Common.checktitle(comquerkey, title):
                Logger.getlogging().warning('{url}:40000 out of range, the title!'.format(url=params.originalurl))
                continue
            if not compareNow(tm, self.querylastdays):
                Logger.getlogging().warning('{url}:40000 out of range, the time!'.format(url=params.originalurl))
                continue
            urllist.append(url)
        self.__storeurllist__(urllist,SPIDER_S2_WEBSITE_VIDEO)
def step2(self, params):
    """Extract main comments from a Tianya thread page and queue the
    child-comment (reply) pages.

    Flow:
      1. locate each .atl-main > .atl-item and pull the main comment body,
         time, replyid and child-comment count;
      2. derive the child page count from child_count / self.page_size;
      3. build each child-comment url from item / artId / replyid / page.
    """
    item = params.customized['item']
    artId = params.customized['artId']
    page = params.customized['page']
    soup = BeautifulSoup(params.content, 'html5lib')
    alt_items = soup.select('.atl-main > .atl-item')
    #print 'alt_items:',len(alt_items)
    # On page 1 the first .atl-item is the thread body, not a comment.
    if page == 1:
        alt_items = alt_items[1:]
    for alt_item in alt_items:
        # The last span in the item head carries the publish time.
        curtime = alt_item.select('.atl-head > div.atl-info > span')
        curtime = getuniformtime(curtime[-1].get_text())
        main_comment = alt_item.select_one('.bbs-content').get_text()
        replyid = alt_item.select_one('a[class="reportme a-link"]').get(
            'replyid')
        content = main_comment.strip()
        commentid = replyid
        nick = alt_item.select_one('a[class="js-vip-check"]').get_text()
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
        # Reply-count label; no digits means the item has no replies.
        child_comment_num = alt_item.select_one(
            'a[class="a-link-2 ir-remark"]').get_text()
        if self.r.search('\d+', child_comment_num):
            child_comment_num = self.r.parse('\d+', child_comment_num)[0]
        else:
            child_comment_num = 0
            continue
        pageNum = int(math.ceil(float(child_comment_num) / self.page_size))
        # NOTE(review): this loop variable shadows the outer 'page'
        # parameter -- confirm intentional (page is not used afterwards).
        for page in range(1, int(pageNum) + 1):
            child_url = self.COMMENTS_CHILD_URL.format(item=item,
                                                       artId=artId,
                                                       replyId=replyid,
                                                       page=page)
            #print 'child_url:',child_url
            self.storeurl(child_url, params.originalurl,
                          self.STEP_COMMENT_CHILD_PAGE, {
                              'item': item,
                              'artId': artId
                          })
def getcontents(self, proparam):
    """Extract comments from an escaped JSON-ish payload with regexes and
    store any that are not already persisted.

    Bug fixed: the per-comment time was parsed into a throwaway variable
    while ``curtime`` was built from the whole ``ctime`` list (the old
    ``TimeUtility.getuniformtime(ctime)`` call), so every comment was
    stored with a bogus timestamp. Also stopped shadowing the ``time``
    module with a local name.
    """
    # Regexes over the escaped payload: body, timestamp, nickname.
    comments = re.findall(r'<p>(.+?)<\\/p>', proparam.content)
    ctime = re.findall(r'<span class=\\"date\\">(.+?)<\\/span>',
                       proparam.content)
    nicks = re.findall(r'class=\\"author-name\\">(.+?)<\\/a>',
                       proparam.content)
    for index in range(0, len(comments)):
        # Entries contain \uXXXX escapes; eval of a unicode literal decodes
        # them (kept from the original -- note eval on scraped data is
        # risky even though the regexes constrain the input).
        curtime = getuniformtime(eval('u"' + ctime[index] + '"'))
        content = eval('u"' + comments[index] + '"').encode('utf-8')
        nick = eval('u"' + nicks[index] + '"').encode('utf-8')
        if not CMTStorage.exist(proparam.originalurl, content, curtime,
                                nick):
            CMTStorage.storecmt(proparam.originalurl, content, curtime,
                                nick)
def step3bbs(self, params): Logger.getlogging().info("Ea3wcomments.STEP_3") # Step3: 通过Step2设置的url,得到所有评论,抽取评论 xparser = XPathUtility(params.content) commentsinfo = xparser.getcomments('//p[@class="comment-content"]') commentstime = xparser.getcomments('//span[@class="time"]') comments = [] # 获取评论 for index in range(0, int(len(commentsinfo)), 1): # 提取时间 cmti = CommentInfo() cmti.content = commentsinfo[index] if str(commentstime[index]).strip().decode("utf8") == '刚刚'.decode( "utf8"): tm = getuniformtime(str(datetime.datetime.now())) else: tm = getuniformtime(str(commentstime[index])) if URLStorage.storeupdatetime(params.originalurl, tm): comments.append(cmti) # 保存获取到的评论 if len(comments) > 0: self.commentstorage.store(params.originalurl, comments)
def step3(self, params):
    """Collect new comments from the page and hand them to comment
    storage."""
    page = BeautifulSoup(params.content, 'html5lib')
    fresh = []
    for node in page.select('.commertItem'):
        posted = getuniformtime(node.select_one('.comment-time').get_text())
        body = node.select_one('.recTxt').get_text()
        # storeupdatetime returns True only for comments newer than the
        # stored update time.
        if not URLStorage.storeupdatetime(params.originalurl, posted):
            continue
        info = CommentInfo()
        info.content = body
        fresh.append(info)
    if fresh:
        self.commentstorage.store(params.originalurl, fresh)
def getanswers(self, params):
    """Parse answer items from the page and store each unseen answer as a
    comment (no nickname available; recorded as "none")."""
    page = BeautifulSoup(params.content, 'html5lib')
    for entry in page.select('.answer-wrapper > .answer-item'):
        posted = getuniformtime(entry.select_one('.user').get_text())
        body = entry.select_one('.content').get_text()
        nick = "none"
        # Skip answers that are already persisted.
        if CMTStorage.exist(params.originalurl, body, posted, nick):
            continue
        CMTStorage.storecmt(params.originalurl, body, posted, nick)
def step3(self, params):
    """Build the news url list: keep entries whose title matches the query
    keyword and whose publish date lies inside the look-back window."""
    keyword = Common.urldec(params.customized['query'])
    page = BeautifulSoup(params.content, 'html5lib')
    entries = page.select('.wzlist > ul > li.wztitle')
    if not entries:
        return
    matched = []
    for entry in entries:
        anchor = entry.select_one('a')
        # Title must match the decoded query keyword.
        if not Common.checktitle(keyword, anchor.get_text()):
            continue
        published = entry.select_one('span').get_text()
        # hrefs are site-relative; prepend the site root.
        target = 'http://www.52tian.net' + anchor.get('href')
        if compareNow(getuniformtime(published), self.querylastdays):
            matched.append(target)
    self.__storeurllist__(matched, SPIDER_S2_WEBSITE_VIDEO)
def step5bbs(self, params):
    """Extract comment bodies (.cpl_nrr2) and times (.cpl_nrr1) from the
    page and store the ones newer than the stored update time."""
    soup = BeautifulSoup(params.content, 'html.parser')
    commentsinfo = soup.select('.cpl_nrr2')
    commentstime = soup.select('.cpl_nrr1')
    comments = []
    # Skips the last .cpl_nrr2 node (len - 1) -- presumably a non-comment
    # footer element; confirm against the page layout.
    for index in range(0, int(len(commentsinfo) - 1), 1):
        cmti = CommentInfo()
        cmti.content = commentsinfo[index].get_text()
        # Time label looks like "发表于 <date>"; capture the date part.
        publicTime = self.r.parse(
            ur'发表于 (.*)', commentstime[index].get_text().strip())[0]
        publicTime = getuniformtime(publicTime)
        # Keep only comments newer than the stored update time.
        if URLStorage.storeupdatetime(params.originalurl, publicTime):
            comments.append(cmti)
    if len(comments) > 0:
        # Persist the collected comments.
        self.commentstorage.store(params.originalurl, comments)
def setpubtime(self, params):
    """Recover the article publish date for chanye.18183.com pages and
    write it into NewsStorage."""
    newtime = None
    if re.search('http://chanye\.18183\.com/.*', params.url):
        Xhtml = XPathUtility(params.content)
        timestr = Xhtml.getstring(
            '//*[@class="arc-other"]/span[3]|//*[@class="other"]/span[3]')
        if not timestr:
            return
        # Two-digit-year pattern "yy-mm-dd": prefix the current century
        # (first two digits of the current year).
        p = '(\d{2}-\d+-\d+)'
        if re.search(p, timestr):
            new = str(time.localtime()[0])[0:2] + re.findall(p, timestr)[0]
            newtime = getuniformtime(new)
    #if re.search('http://bbs\.18183\.com/.*',params.url):
    #Xhtml = XPathUtility(params.content)
    #timestr = Xhtml.getstring('//*[@class="authi"]/em')
    #if not timestr:
    #return
    #times = timestr.split(u'发表于')[1]
    #newtime = TimeUtility.getuniformtime(times)
    if newtime:
        NewsStorage.setpublishdate(params.originalurl, newtime)
def getkurlcomments(self, params):
    """Extract comments and their publish times and store the new ones.

    Bug fixed: when there are more comment nodes than timestamp nodes the
    two lists are aligned at the tail, so the timestamp index must be
    shifted by ``start`` -- the old code indexed pubtime_xpath with the
    comment index and raised IndexError whenever start > 0.
    """
    xparser = XPathUtility(params.content)
    # Comment bodies.
    comments_xpath = xparser.xpath('//*[@class="page-pl-list-text"]')
    # Comment publish times.
    pubtime_xpath = xparser.xpath('//*[@class="page-pl-user-timer"]')
    if len(comments_xpath) >= len(pubtime_xpath):
        # Leading comments without a matching timestamp are skipped.
        start = len(comments_xpath) - len(pubtime_xpath)
        comments = []
        for index in range(start, len(comments_xpath), 1):
            # Keep only comments newer than the stored update time.
            if URLStorage.storeupdatetime(
                    params.originalurl,
                    getuniformtime(pubtime_xpath[index - start].text)):
                cmti = CommentInfo()
                cmti.content = comments_xpath[index].text
                comments.append(cmti)
        # Persist the collected comments.
        if len(comments) > 0:
            self.commentstorage.store(params.originalurl, comments)
def step2(self, params):
    """Extract answer items (time / body / author) from the page and store
    unseen ones as comments."""
    try:
        soup = BeautifulSoup(params.content, 'html5lib')
        items = soup.select('.List > div > .List-item')
        if not items:
            return
        for item in items:
            times = item.select_one('.ContentItem-time').get_text()
            content = item.find(
                attrs={
                    "class": "RichText CopyrightRichText-richText"
                }).get_text()
            curtime = getuniformtime(times)
            nick = item.select_one(
                '.ContentItem-meta > .AnswerItem-meta > .AuthorInfo'
            ).get_text()
            if not CMTStorage.exist(params.originalurl, content, curtime,
                                    nick):
                CMTStorage.storecmt(params.originalurl, content, curtime,
                                    nick)
    except:
        # A missed selector raises on .get_text(); log and return.
        Logger.printexception()
def step2(self, params):
    """Filter video search results by publish time and title keyword, then
    queue the surviving urls."""
    info = params.customized['query']
    info = Common.urldec(info)
    soup = BeautifulSoup(params.content, 'html5lib')
    videos = soup.select('.uiVideo > .uiVideo__item')
    if videos:
        urllist = []
        for video in videos:
            title = video.select_one('h3 > a').get('title')
            # The last span of .result__data holds the publish time.
            pubtime = video.select('.result__data > span')[-1].get_text()
            url = video.select_one('h3 > a').get('href')
            # if not info in title:
            # Time-window check first, then title match; each rejection is
            # logged with its own warning code.
            if compareNow(getuniformtime(pubtime), self.querylastdays):
                if Common.checktitle(info, title):
                    urllist.append(url)
                else:
                    Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
            else:
                Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def getsearchresult(self, params):
    """Scan a search-result page and queue matching urls.

    Returns the number of urls queued, or -1 when the page contains no
    result list at all.
    """
    query = params.customized['query']
    page = BeautifulSoup(params.content, 'html5lib')
    rows = page.select('ul.ckl_cktpp > li.cfix')
    if not rows:
        return -1
    hits = []
    for row in rows:
        caption = row.select_one('h3').get_text()
        # Title must match the query keyword.
        if not Common.checktitle(query, caption):
            continue
        # Second-to-last <p> carries the publish time.
        stamp = getuniformtime(row.select('p')[-2].get_text())
        target = row.select_one('h3 > a').get('href')
        # Publish time must fall inside the look-back window.
        if compareNow(stamp, self.querylastdays):
            hits.append(target)
    if hits:
        self.__storeurllist__(hits, SPIDER_S2_WEBSITE_VIDEO)
    return len(hits)
def step2(self, params):
    """Parse book search results; queue books whose title contains the
    decoded keyword and whose publish time lies inside the window."""
    try:
        key = params.customized['key']
        key = Common.urldec(key)
        soup = BeautifulSoup(params.content, 'html5lib')
        books = soup.select('#searchResult > .book')
        if books:
            urllist = []
            for book in books:
                title = book.select_one('h3 > a').get_text()
                # Plain substring match; other spiders in this project use
                # Common.checktitle here -- confirm intentional difference.
                if key not in title:
                    continue
                # Second-to-last .w_auth cell carries the publish time.
                pubtime = book.select('.w_auth')[-2].get_text()
                url = book.select_one('h3 > a').get('href')
                if compareNow(getuniformtime(pubtime), self.querylastdays):
                    urllist.append(url)
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_NEWS)
    except:
        Logger.printexception()
        Logger.getlogging().error(
            'extract comment error from {site}'.format(site=params.url))
def geturlcomments(self, params):
    """Pair comment nodes with their publish-time nodes and store the new
    ones via commentstorage."""
    parser = XPathUtility(params.content)
    bodies = parser.xpath('//*[contains(@id, "cm_")]')
    if not bodies:
        return
    stamps = parser.getlist('//*[contains(@id,"CList___CommentList_UserLink_")]/..')
    # Only proceed when the two node lists line up one-to-one.
    if len(bodies) != len(stamps):
        return
    fresh = []
    for body, stamp in zip(bodies, stamps):
        entry = CommentInfo()
        # Keep only comments newer than the stored update time.
        if URLStorage.storeupdatetime(params.originalurl,
                                      getuniformtime(stamp)):
            entry.content = body.text
            fresh.append(entry)
    if fresh:
        self.commentstorage.store(params.originalurl, fresh)
def get_comment_reply_step3(self, params):
    """Walk the reply lists of a comment JSON payload and store every
    unseen reply (nickname recorded as the literal string 'nick')."""
    try:
        jsondata = json.loads(params.content)
        data = jsondata['data']
        if data:
            comment_list = data['comment_list']
            for comment_id in comment_list:
                try:
                    comments = comment_list[comment_id]
                    comment_list_num = comments['comment_list_num']
                    # Skip threads that report zero replies.
                    if int(comment_list_num) <= 0:
                        continue
                    for info in comments['comment_info']:
                        curtime = getuniformtime(str(info['now_time']))
                        content = info['content']
                        if not CMTStorage.exist(params.originalurl, content,
                                                curtime, 'nick'):
                            CMTStorage.storecmt(params.originalurl, content,
                                                curtime, 'nick')
                except:
                    # One bad thread must not stop the others.
                    Logger.printexception()
    except:
        Logger.printexception()
def process(self, params):
    """Three-step comment crawl: resolve the thread sid, page through the
    comment API, then store the comments of each page.

    Fixes: step constants were compared with 'is' (identity); value
    comparison '==' is used throughout. Removed an unused local list in
    the STEP_3 branch.
    """
    try:
        if params.step == AllComments.STEP_1:
            comments_url = AllComments.SID_URL % (params.originalurl)
            self.storeurl(comments_url, params.originalurl,
                          AllComments.STEP_2)
        elif params.step == AllComments.STEP_2:
            try:
                threadid = self.r.parse('"sid":(.*?),"', params.content)[0]
                curcmtnum = int(
                    self.r.parse('"postcount":(.*?),"', params.content)[0])
                NewsStorage.setcmtnum(params.originalurl, curcmtnum)
                dbcmtnum = CMTStorage.getcount(params.originalurl, True)
                # Nothing new to fetch.
                if dbcmtnum >= curcmtnum:
                    return
                # API serves 3 comments per page.
                pagenum = int(math.ceil(float(curcmtnum - dbcmtnum) / 3))
                for page in range(0, pagenum, 1):
                    comments_url = AllComments.COMMENTS_URL % (threadid,
                                                               page)
                    self.storeurl(comments_url, params.originalurl,
                                  AllComments.STEP_3)
            except:
                Logger.printexception()
        elif params.step == AllComments.STEP_3:
            # JSON payload of one comment page.
            commentsinfo = json.loads(params.content)
            for comment in commentsinfo['data']:
                pubdate = getuniformtime(comment['time'])
                content = comment['cnt']
                CMTStorage.storecmt(params.originalurl, content, pubdate, '')
    except:
        Logger.printexception()