def getclick(self, params):
    # Queue the play-count url when the original url matches the le.com video pattern.
    pattern = 'https?://\w+\.le\.com.*/\w+/(\d+)\.html'
    if self.r.search(pattern, params.originalurl):
        vid = self.r.parse(pattern, params.originalurl)[0]
        playcount_url = self.PALYCOUNT_URL.format(vid=vid)
        self.storeurl(playcount_url, params.originalurl, LeComments.STEP_PALY)
    if NewsStorage.getpublishdate(params.originalurl) == TimeUtility.getintformtime(0):
        if self.r.search('https?://sports\.le\.com/video/\d+\.html', params.originalurl):
            # Fetch the publish date for the sports channel only.
            pubTime = XPathUtility(params.content).getstring('//*[@class="live-vedio-infor"]')
            publishdate = TimeUtility.getuniformtime(pubTime)
            NewsStorage.setpublishdate(params.originalurl, publishdate)
        else:
            # Fetch the publish date for the variety-show channel only.
            title = XPathUtility(params.content).getstring('//h1[@class="j-video-name video-name"]')
            if title:
                if re.search('\d{8}', title):
                    publishdate = re.findall('\d{8}', title)[0]
                    NewsStorage.setpublishdate(params.originalurl, publishdate)
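# A minimal, self-contained sketch of the vid extraction above, using only the
# standard re module; the sample url is hypothetical but follows the le.com
# shape that getclick() matches.
import re

def _demo_extract_le_vid():
    pattern = r'https?://\w+\.le\.com.*/\w+/(\d+)\.html'
    sample = 'http://sports.le.com/vplay/26738797.html'  # hypothetical url
    match = re.search(pattern, sample)
    assert match is not None and match.group(1) == '26738797'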
def process(self, params):
    if params.step == IfengS2Query.IFENG_S2QUERY_FIRST_PAGE:
        q = params.customized['query']
        xparser = XPathUtility(params.content)
        # Total number of search hits (the original compared this as a
        # string against 0, which never matched).
        mid_count = xparser.getnumber('//div[@class="serpinfo"]/span/em')
        count = int(str(mid_count).strip())
        # No hits: nothing to queue.
        if count <= 0:
            return
        pagenum = int(math.ceil(float(count) / IfengS2Query.DEFAULT_PAGE_SIZE))
        if pagenum >= self.maxpages:
            pagenum = self.maxpages
        querylist = []
        for page in range(1, pagenum + 1):
            url = IfengS2Query.IFENG_QUERY_TEMPLATE.format(pn=page, q=q)
            querylist.append(url)
        self.__storeqeuryurllist__(querylist, IfengS2Query.IFENG_S2QUERY_EACH_PAGE, {'info': q})
    elif params.step == IfengS2Query.IFENG_S2QUERY_EACH_PAGE:
        self.step2(params)
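# The page-count arithmetic shared by these query classes, in isolation:
# ceil(total_hits / page_size), capped at maxpages. The function and argument
# names here are illustrative, not part of the spider framework.
import math

def _demo_page_count(count, page_size, maxpages):
    pagenum = int(math.ceil(float(count) / page_size))
    return min(pagenum, maxpages)

# e.g. 95 hits at 10 per page give 10 pages, capped to 5 when maxpages is 5.
assert _demo_page_count(95, 10, 5) == 5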
def getpagecomments(self, params):
    info = params.customized['query']
    xpath = XPathUtility(html=params.content)
    hrefs = xpath.xpath('//*[@class="sosResult"]/strong/a/@href')
    titles = xpath.getlist('//*[@class="sosResult"]/strong/a')
    pubtimes = xpath.xpath('//*[@class="sosResult"]/span/cite[3]')
    today = datetime.datetime.strptime(TimeUtility.getcurrentdate(),
                                       TimeUtility.DATE_FORMAT_DEFAULT).date()
    urllist = []
    for index in range(len(titles)):
        # Keep only results whose title contains the query keyword.
        if Common.checktitle(info, titles[index]):
            pubtimestr = TimeUtility.getuniformtime(pubtimes[index].text).split(' ')[0]
            pubtime = datetime.datetime.strptime(pubtimestr,
                                                 TimeUtility.DATE_FORMAT_DEFAULT).date()
            interval = today - pubtime
            # Keep only results published within the query window.
            if interval.days <= int(self.querylastdays):
                newurl = self.preprocess(hrefs[index])
                if newurl is not None:
                    urllist.append(newurl)
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
def step2(self, params):
    Logger.getlogging().info("LaohuComments.STEP_2")
    token = params.customized['token']
    sourceId = params.customized['sourceId']
    xhtml = XPathUtility(html=params.content)
    # The header reads like "网友评论(32)"; strip the 5-character prefix and
    # the closing bracket to get the count.
    countstr = xhtml.getlist('//*[@class="filter-by-type"]')[0]
    comment_counts = int(countstr[5:len(countstr) - 1])
    if comment_counts:
        NewsStorage.setcmtnum(params.originalurl, comment_counts)
    if comment_counts == 0:
        Logger.getlogging().warning('{url}:30000 No comments'.format(url=params.originalurl))
        return
    # Incremental check: skip when nothing new has arrived.
    cmtnum = CMTStorage.getcount(params.originalurl, True)
    if cmtnum >= comment_counts:
        return
    page_num = int(math.ceil(float(comment_counts - cmtnum) / self.PAGE_SIZE))
    if page_num >= self.maxpages:
        page_num = self.maxpages
    # Fetch the first page of comments.
    self.getComments(params)
    if comment_counts > 15:
        # Queue the remaining comment pages for the download platform.
        # LaohuComments.COMMENTS_URL = 'http://member.laohu.com/comment/ajax?page=%d&token=%s&order=new'
        for page in range(2, page_num + 1):
            # NOTE: the template's second slot is named token=; passing the
            # token here is an assumed fix (the source passed sourceId).
            commentUrl = LaohuComments.COMMENTS_URL % (page, token)
            self.storeurl(commentUrl, params.originalurl, LaohuComments.STEP_3,
                          {'token': token, 'sourceId': sourceId})
def baidutiebasearch_step2(self, params):
    # Step2: read the total hit count from the returned page and derive the
    # last page number from the pager.
    # Parse the first page of search results.
    self.baidutiebasearch_step3(params)
    xparser = XPathUtility(html=params.content)
    pager_search = xparser.xpath('//*[@class="pager pager-search"]')
    queryurl = ''
    if pager_search:
        tailpageurl = xparser.xpath('//*[@class="pager pager-search"]/a[last()]/@href')
        try:
            if tailpageurl:
                # The last pager link looks like '...&pn=<tailpage>'.
                lists = tailpageurl[0].split('pn=')
                queryurl = 'http://tieba.baidu.com' + lists[0]
                tailpage = int(lists[1])
                if tailpage > BaiduTiebaS2Query2.DEFAULT_MAX_PAGESIZE:
                    tailpage = BaiduTiebaS2Query2.DEFAULT_MAX_PAGESIZE
                if tailpage > self.maxpages:
                    tailpage = self.maxpages
            else:
                tailpage = 1
        except:
            tailpage = 1
    else:
        # No search results: return directly.
        Logger.log(params.url, constant.ERRORCODE_EXCEPTTION_JSON)
        return
    if not queryurl:
        return
    # From tailpage, build the search-result urls for every page but the first.
    querylist = []
    for page in range(2, tailpage + 1):
        url = queryurl + 'pn={page}'.format(page=page)
        querylist.append(url)
    self.__storeqeuryurllist__(querylist, BaiduTiebaS2Query2.BAIDU_TIEBA_SEARCH_EACH_PAGE)
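# How the tail-page parse above behaves on a typical pager href. The href
# value is a hypothetical example of the '...&pn=N' shape the code assumes.
def _demo_tailpage_split():
    href = '/f/search/res?ie=utf-8&qw=keyword&pn=25'  # hypothetical
    base, page = href.split('pn=')
    assert 'http://tieba.baidu.com' + base == \
        'http://tieba.baidu.com/f/search/res?ie=utf-8&qw=keyword&'
    assert int(page) == 25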
def step1(self, params):
    # Parameters carried over from the first result page.
    info = params.customized['query']
    xparser = XPathUtility(params.content)
    if not xparser.xpath('//*[@class="mytopic topiclisttr"]'):
        Logger.log(params.url, constant.ERRORCODE_WARNNING_NORESULTS)
        return
    pageList = xparser.getcomments('//span[@class="right"]/a')
    if len(pageList) == 1:
        pageTotal = 1
    else:
        pageTotal = pageList[-2]
    if int(pageTotal) >= self.maxpages:
        pageTotal = self.maxpages
    # Build the query url for every page; page 1 is parsed directly.
    querylist = []
    for page in range(1, int(pageTotal) + 1):
        if page == 1:
            self.step2(params)
            continue
        url = hupuS2Query.HUPU_QUERY_TEMPLATE.format(q=info, pn=page)
        querylist.append(url)
    self.__storeqeuryurllist__(querylist, hupuS2Query.HUPU_S2QUERY_EACH_PAGE, {'query': info})
def step1(self, params):
    key = params.customized['key']
    srchfrom = params.customized['srchfrom']
    xpath = XPathUtility(params.content)
    text = xpath.getstring('//*[@id="main"]/span')
    # Label that precedes the total hit count on the page.
    tstr = u'搜索总条数'
    if not self.r.search(tstr, text):
        # Assumed no-results warning (the log call was left incomplete in
        # the source).
        Logger.log(params.url, constant.ERRORCODE_WARNNING_NORESULTS)
        return
    num = self.r.parse('\d+', text)[0]
    pages = int(math.ceil(float(num) / self.page_size))
    if pages >= self.maxpages:
        pages = self.maxpages
    querylist = []
    for page in range(1, pages + 1):
        if page == 1:
            self.step2(params)
            continue
        url = TGbusS2Query.TGBUS_QUERY_TEMPLATE.format(key=key, page=page, srchfrom=srchfrom)
        querylist.append(url)
    if querylist:
        self.__storeqeuryurllist__(querylist, TGbusS2Query.TGBUS_S2QUERY_EACH_PAGE, {'key': key})
def step1(self, params):
    # Take the article id from the url.
    articleId = self.r.parse(r'^https://movie\.douban\.com/\w+/(\d+)', params.url)[0]
    # Read the comment count.
    xpathobj = XPathUtility(params.content)
    text = xpathobj.getstring(xpath='//*[@id="comments-section"]//h2/*[@class="pl"]/a')
    numtext = self.r.parse('\d+', text)
    if not numtext:
        return
    curcmtnum = float(numtext[0])
    NewsStorage.setcmtnum(params.originalurl, curcmtnum)
    dbcmtnum = CMTStorage.getcount(params.originalurl, True)
    if dbcmtnum >= curcmtnum:
        return
    # Queue urls only for the comment pages not yet stored.
    pages = int(math.ceil(float(curcmtnum - dbcmtnum) / self.PAGE_SIZE))
    if pages >= self.maxpages:
        pages = self.maxpages
    for page in range(1, pages + 1):
        url = doubanComments.COMMENTS_URL.format(articleId=articleId,
                                                 start=(page - 1) * self.PAGE_SIZE,
                                                 pagesize=self.PAGE_SIZE)
        self.storeurl(url, params.originalurl, doubanComments.STEP_2)
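# The incremental-fetch rule used by step1 above, in isolation: only comments
# beyond what is already stored get paged in. Names are illustrative.
import math

def _demo_pages_to_fetch(curcmtnum, dbcmtnum, page_size, maxpages):
    if dbcmtnum >= curcmtnum:
        return 0
    pages = int(math.ceil(float(curcmtnum - dbcmtnum) / page_size))
    return min(pages, maxpages)

# 120 comments on the site, 100 already stored, 20 per page: one page to fetch.
assert _demo_pages_to_fetch(120, 100, 20, 10) == 1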
def step3news(self, params):
    Logger.getlogging().info("ZolbbsComments.STEP_3")
    # Step3: extract every comment from the url set up in Step2.
    xparser = XPathUtility(params.content)
    commentsinfo = xparser.getcomments('//*[@class="comment-list-new"]//*[@class="commli"]/p')
    commentstime = xparser.getcomments('//*[@class="comment-list-new"]//*[@class="published-time"]')
    commentsnick = xparser.getcomments('//*[@class="comment-list-new"]//*[@class="user-name"]')
    # Store the comments and keep the actual comment count in step.
    for index in range(len(commentstime)):
        # Extract the publish time.
        tm = commentstime[index].strip()
        try:
            curtime = TimeUtility.getuniformtime(getuniformtime(tm), u'%Y-%m-%d %H:%M')
        except Exception:
            curtime = getuniformtime(tm)
        # Extract the comment body and the nick.
        content = commentsinfo[index]
        nick = commentsnick[index]
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def getcomments_step2(self, params):
    bookId = params.customized['bookId']
    xhtml = XPathUtility(html=params.content)
    page_counts = int(xhtml.xpath('//div[@class="page"]/@pagenum')[0])
    comments_count = int(xhtml.xpath('//div[@class="page"]/@total')[0])
    Logger.getlogging().debug(comments_count)
    if page_counts == 0:
        return
    # Incremental check: skip when nothing new has arrived.
    cmtnum = CMTStorage.getcount(params.originalurl, True)
    if cmtnum >= comments_count:
        return
    page_num = int(math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
    if page_num >= self.maxpages:
        page_num = self.maxpages
    NewsStorage.setcmtnum(params.originalurl, comments_count)
    for page in range(1, page_num + 1):
        comment_url = PubComments.COMMENTS_URL
        self.storeposturl(comment_url, params.originalurl, PubComments.STEP_3,
                          {'bookId': bookId, 'pageNum': page})
def geturlcomments(self, params):
    # Extract the comments themselves.
    xparser = XPathUtility(params.content)
    comments_xpath = xparser.xpath('//*[@id="short_comment_content"]')
    if not comments_xpath:
        return
    # Extract the publish times.
    ip_pubtimes_xpath = xparser.getlist('//*[@id="short_comment_left"]')
    if len(comments_xpath) == len(ip_pubtimes_xpath):
        comments = []
        for index in range(len(comments_xpath)):
            cmti = CommentInfo()
            publicTime = ip_pubtimes_xpath[index]
            # Normalize 'YY-MM-DD HH:MM' to '20YY-MM-DD HH:MM'.
            if self.r.search(ur'\d{2}-\d+-\d+ \d+:\d+', publicTime):
                publicTime = '20' + self.r.parse(ur'\d{2}-\d+-\d+ \d+:\d+', publicTime)[0]
            if self.r.search(ur'\d+/\d+/\d+ \d+:\d+:\d+', publicTime):
                publicTime = self.r.parse(ur'\d+/\d+/\d+ \d+:\d+:\d+', publicTime)[0]
            # Keep only comments newer than the stored update time.
            if URLStorage.storeupdatetime(params.originalurl, getuniformtime(publicTime)):
                cmti.content = comments_xpath[index].text
                comments.append(cmti)
def step2bbs(self, params):
    Logger.getlogging().info("Dm5Commnets.STEP_2")
    # docurl handed down from STEP_1.
    docurl = params.customized['docurl']
    comments_count = int(self.r.parse(ur'(\d+)个回复', params.content)[0])
    # Incremental check: skip when nothing new has arrived.
    cmtnum = URLStorage.getcmtnum(params.originalurl)
    if cmtnum >= comments_count:
        return
    URLStorage.setcmtnum(params.originalurl, comments_count)
    # Read the number of comment pages from the pager (fall back to 1).
    xparser = XPathUtility(params.content)
    if not xparser.xpath('//*[@class="inkk ma5"]'):
        Logger.getlogging().warning('{0}:30001'.format(params.originalurl))
        return
    pageList = xparser.xpath('//*[@id="search_fy"]/a/text()')
    if not pageList:
        pagenum = 1
    else:
        pagenum = int(pageList[-2])
    for page in range(1, pagenum + 1):
        comment_url = Dm5Commnets.COMMENT_URL.format(docurl=docurl, page=page)
        self.storeurl(comment_url, params.originalurl, Dm5Commnets.STEP_3_BBS)
def process(self, params):
    # Build the result-page urls from the hit count on the first search page.
    if params.step == LaohuS2Query.LAOHU_S2QUERY_FIRST_PAGE:
        # Parameters of the first page.
        KEY = params.customized['KEY']
        time = params.customized['time']
        # Total page count, shown on the page as e.g. "1/12".
        xparser = XPathUtility(params.content)
        pageCounts = xparser.getlist('//*[@id="main"]/div[2]/span')
        if len(pageCounts) > 0:
            page = str(pageCounts[0]).split('/')[1]
            # Parse the first page of results.
            self.pageprocess(params)
            if int(page) > 1:
                if int(page) >= self.maxpages:
                    page = self.maxpages
                # Build the urls from page 2 on (page 1 is already parsed).
                querylist = []
                for pages in range(2, int(page) + 1):
                    url = LaohuS2Query.LAOHU_QUERY_TEMPLATE.format(KEY=KEY, pn=pages, time=time)
                    querylist.append(url)
                self.__storeqeuryurllist__(querylist,
                                           LaohuS2Query.LAOHU_S2QUERY_EACH_PAGE,
                                           {'KEY': KEY})
        else:
            Logger.getlogging().debug('Sorry, no posts found for ' + KEY)
    # Extract the video urls from each result page.
    elif params.step == LaohuS2Query.LAOHU_S2QUERY_EACH_PAGE:
        self.pageprocess(params)
def getsearchresult(self, params):
    info = params.customized['query']
    xpath = XPathUtility(html=params.content)
    hrefs = xpath.xpath('//li/h3/a/@href')
    titles = xpath.getlist('//li/h3/a')
    pubtimes = xpath.xpath('//li/p')
    today = datetime.datetime.strptime(TimeUtility.getcurrentdate(),
                                       TimeUtility.DATE_FORMAT_DEFAULT).date()
    urllist = []
    for index in range(len(titles)):
        # Keep only results whose title contains the query keyword.
        if Common.checktitle(info, titles[index]):
            pubtimestr = TimeUtility.getuniformdate(pubtimes[index].text)
            pubtime = datetime.datetime.strptime(pubtimestr,
                                                 TimeUtility.DATE_FORMAT_DEFAULT).date()
            interval = today - pubtime
            # Keep only results published within the query window.
            if interval.days <= self.querylastdays:
                urllist.append(hrefs[index])
            else:
                # Results are sorted by time: once one result falls outside
                # the window, all later ones do too.
                break
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def step3(self, params):
    Logger.getlogging().info("Flash8Comments.STEP_3")
    # Step3: extract every comment from the url set up in Step2.
    page = params.customized['page']
    xparser = XPathUtility(params.content)
    commentsinfo = xparser.getcomments('//td[@class="t_f"]')
    commentstime = xparser.getcomments('//div[@class="authi"]/em')
    comments = []
    # On page 1 the first entry is skipped (it is the post itself, not a
    # comment); note the original used `if page is 1`, which is unreliable.
    if page == 1:
        startIndex = 1
    else:
        startIndex = 0
    for index in range(startIndex, len(commentstime)):
        cmti = CommentInfo()
        # Keep only comments newer than the stored update time.
        if URLStorage.storeupdatetime(params.originalurl, commentstime[index]):
            cmti.content = commentsinfo[index]
            comments.append(cmti)
    # Store whatever was collected.
    if len(comments) > 0:
        self.commentstorage.store(params.originalurl, comments)
def bbs_step3(self, params):
    try:
        xparser = XPathUtility(params.content)
        page = params.customized['page']
        pagecount = params.customized['pagecount']
        updatetimes = []
        nicks = []
        contents = xparser.getcomments('//*[@class="read"]')
        mid_times = xparser.getlist('//td[@class="authorname"]')
        for times in mid_times:
            # Each cell reads like '<nick>于<YYYY-MM-DD HH:MM:SS>留言'.
            updatetimes.append(self.r.parse(ur'于(\d+-\d+-\d+ \d+:\d+:\d+)留言', times)[0])
            nicks.append(self.r.parse(ur'(.*)于', times)[0])
        # On the first page, the opening post is not a comment: skip it.
        if page == 0:
            mid_index = 1
        else:
            mid_index = 0
        comments_number = xparser.getnumber('//*[@id="msgsubject"]/font')
        if comments_number != 0:
            for index in range(mid_index, len(contents)):
                curtime = TimeUtility.getuniformtime(updatetimes[index])
                content = contents[index]
                nick = nicks[index].split('于')[0].split('☆')[-1]
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    except Exception:
        traceback.print_exc()
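# What the two regexes in bbs_step3 pull out of an "authorname" cell. The
# sample string is hypothetical but follows the '<nick>于<time>留言' shape
# the code assumes.
import re

def _demo_parse_author_cell():
    cell = u'☆小明于2017-01-01 12:00:00留言'  # hypothetical cell text
    when = re.search(ur'于(\d+-\d+-\d+ \d+:\d+:\d+)留言', cell).group(1)
    nick = re.search(ur'(.*)于', cell).group(1)
    assert when == u'2017-01-01 12:00:00'
    assert nick.split(u'☆')[-1] == u'小明'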
def step1(self, params):
    Logger.getlogging().info("DmOneTwoThreeNewsComments.STEP_1")
    id = self.r.parse('^http://www.dm123.cn/.*/(\d+).html', params.originalurl)[0]
    xparser = XPathUtility(params.content)
    classid = xparser.xpath("//input[@id='classid']/@value")[0]
    # 1. Build the url of the first comment page from the original url.
    commentinfo_url = Dm123NewsComments.COMMENT_URL.format(page=0, classid=classid, id=id)
    self.storeurl(commentinfo_url, params.originalurl, Dm123NewsComments.STEP_2,
                  {'classid': classid, 'id': id})
def getcomments_step3(self, params):
    xhtml = XPathUtility(html=params.content)
    contents = xhtml.getlist('//*[contains(@id,"partThreadContent")]')
    curtimes = xhtml.getlist('//*[@class="comment_rw"]/span/em')
    nicks = xhtml.getlist('//*[@class="wzbox"]/h5')
    for index in range(len(contents)):
        curtime = TimeUtility.getuniformtime(curtimes[index] + ':00')
        content = str(contents[index])
        nick = str(nicks[index])
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def step2bbs(self, params):
    Logger.getlogging().info("Ea3wcomments.STEP_2")
    commentinfo_url = params.customized['commentinfo_url'] + "&load=all"
    xparser = XPathUtility(params.content)
    comments_count = xparser.getnumber('//div[@class="at-comment"]/a/span')
    # Record the page's comment count; skip when nothing new has arrived.
    cmtnum = URLStorage.getcmtnum(params.originalurl)
    if cmtnum >= comments_count:
        return
    URLStorage.setcmtnum(params.originalurl, comments_count)
    self.storeurl(commentinfo_url, params.originalurl, Ea3wcomments.STEP_3_BBS)
def pageprocess(self, params):
    xparser = XPathUtility(params.content)
    # All result links on this page.
    hreflist = xparser.xpath('//h3/a/@href')
    hrefs = []
    for mid_url in hreflist:
        mid = self.preprocess(mid_url)
        if mid is not None:
            hrefs.append(mid)
    # All publish times on this page, normalized to 'date time'.
    publictime = xparser.xpath('//*[@class="scontent"]/text()[1]')
    publicTimes = []
    for timeindex in publictime:
        middle = str(timeindex).replace('\n', '').replace('\t', '').strip()
        parts = middle.split(' ')
        publicTimes.append(parts[0] + ' ' + parts[1])
    # All titles on this page.
    titles = []
    for title in xparser.getlist('//h3'):
        titles.append(str(title).replace('\n', '').replace('\t', '').strip())
    # The query keyword.
    KEY = Common.urldec(params.customized['KEY'])
    # Cut-off timestamp: inputtime days before now, as 'YYYY-MM-DD HH:MM:SS'.
    today = datetime.datetime.now()
    before_days = today + datetime.timedelta(-self.inputtime)
    before_time = str(before_days).split('.')[0]
    urllist = []
    len_hrefs = len(hrefs)
    number = 0
    for index in publicTimes[:len_hrefs]:
        # Keep the url when the title matches the keyword and the video was
        # published within the window.
        flg = Common.checktitle(KEY, str(titles[number]))
        if index > before_time and flg:
            urllist.append(hrefs[number])
        number += 1
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
def step1(self, params):
    if re.search('http://.*\.sohu\.com/', params.originalurl):
        cmttext = XPathUtility(params.content).getstring(
            '//*[@class="c-num-red"][2]|//*[@id="changyan_parti_unit"]|//*[@class="remark-tit"]')
        if cmttext:
            try:
                cmtnum = re.findall('\d+', cmttext)[0]
            except:
                cmtnum = -1
        else:
            cmtnum = -1
        if int(cmtnum) == -1:
            pass
        elif int(cmtnum) == 0:
            Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
            return
    else:
        cmttext = XPathUtility(params.content).xpath('//*[@class="prompt-null-w"]')
        if cmttext:
            Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
            return
    liteloadApi = ChangyanComments.liteloadApi
    commentsApi = ChangyanComments.commentsApi
    # Pick the Changyan client_id by site.
    if re.match('http://\w+\.sohu\.com.*', params.originalurl):
        client_id = 'cyqemw6s1'
    elif re.match(r'^http://\w+\.(17173|shouyou|yeyou)\.com/.*', params.originalurl):
        client_id = 'cyqvqDTV5'
    elif re.match(r'^http://sports\.le\.com/.*', params.originalurl):
        client_id = 'cyrJ22d8v'
    # Special case for zdface.com.
    elif re.match(r'^http://\w+\.zdface\.com.*', params.originalurl):
        client_id = 'cyrJOywnM'
    # e.g. http://xx.yzz.cn/xiuba/201609/1017135.shtml
    elif re.match(r'^http://\w+\.yzz\.cn.*', params.originalurl):
        client_id = 'cyrtYf3sa'
    elif re.match(r'^http://\w+\.178\.com.*', params.originalurl):
        client_id = 'cysrntF12'
    elif re.match(r'^http://.*\.cyol\.com/.*', params.originalurl):
        client_id = 'cys3X3zo9'
    else:
        client_id = self.r.getid('appid', params.content)
    topic_url = urllib.quote_plus(params.originalurl)
    # ChangyanComments.LITELOAD_URL = 'http://changyan.sohu.com/api/{liteloadApi}/topic/liteload?client_id={client_id}&topic_url={topic_url}&topic_source_id={topic_source_id}'
    topic_source_id = self.r.getid('sid', params.content)
    if not topic_source_id:
        topic_source_id = self.r.getid('data-widget-sid', params.content)
    comment_url = ChangyanComments.LITELOAD_URL.format(liteloadApi=liteloadApi,
                                                       client_id=client_id,
                                                       topic_url=topic_url,
                                                       topic_source_id=topic_source_id)
    self.storeurl(comment_url, params.originalurl, ChangyanComments.STEP_2,
                  {'client_id': client_id, 'liteloadApi': liteloadApi,
                   'topic_url': topic_url, 'commentsApi': commentsApi})
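# The if/elif chain above is equivalent to a pattern -> client_id table. A
# minimal data-driven sketch of the same dispatch (same patterns and ids,
# same fallback to the appid embedded in the page); this is an alternative
# formulation, not part of the spider framework.
import re

CLIENT_ID_TABLE = [
    (r'^http://\w+\.sohu\.com.*', 'cyqemw6s1'),
    (r'^http://\w+\.(17173|shouyou|yeyou)\.com/.*', 'cyqvqDTV5'),
    (r'^http://sports\.le\.com/.*', 'cyrJ22d8v'),
    (r'^http://\w+\.zdface\.com.*', 'cyrJOywnM'),
    (r'^http://\w+\.yzz\.cn.*', 'cyrtYf3sa'),
    (r'^http://\w+\.178\.com.*', 'cysrntF12'),
    (r'^http://.*\.cyol\.com/.*', 'cys3X3zo9'),
]

def _demo_pick_client_id(url, default):
    for pattern, client_id in CLIENT_ID_TABLE:
        if re.match(pattern, url):
            return client_id
    return default

assert _demo_pick_client_id('http://sports.le.com/video/123.html', None) == 'cyrJ22d8v'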
def getComments(self, params):
    xhtml = XPathUtility(html=params.content)
    commentinfo = xhtml.getlist('//*[@class="recTxt"]')
    updatetimes = xhtml.getlist('//*[@class="comment-time"]')
    for index in range(len(commentinfo)):
        # Strip the wrapping first and last characters before normalizing.
        curtime = TimeUtility.getuniformtime(updatetimes[index][1:len(updatetimes[index]) - 1])
        content = commentinfo[index]
        nick = 'nick'
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def geturlcomments(self, params):
    xparser = XPathUtility(params.content)
    # Pull back all comments; on page 1 the first table is skipped
    # (position()>1), on later pages every table is a comment.
    page = params.customized['page']
    if page == 1:
        commentstimes = xparser.getcomments('//table[position()>1]/tbody/tr/td/span[1]')
        commentscontents = xparser.getcomments('//table[position()>1]/tbody/tr[2]/td[@class="post-main"]')
        commentsnicks = xparser.getcomments('//*[@class="name"]/a')
    else:
        commentstimes = xparser.getcomments('//table/tbody/tr/td/span[1]')
        commentscontents = xparser.getcomments('//table/tbody/tr[2]/td[@class="post-main"]')
        commentsnicks = xparser.getcomments('//*[@class="name"]/a')
    # Store the comments.
    for index in range(len(commentscontents)):
        # The time string carries a four-character prefix; strip it.
        curtime = TimeUtility.getuniformtime(commentstimes[index][4:])
        content = commentscontents[index].strip()
        nick = commentsnicks[index].strip()
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def getcomments_step3(self, params):
    xhtml = XPathUtility(html=params.content)
    contents = xhtml.getlist('//*[@class="wz"]/p')
    curtimes = xhtml.getlist('//*[@class="fr"]')
    nicks = xhtml.getlist('//*[@class="wzbox"]/h5')
    for index in range(len(contents)):
        curtime = curtimes[index][4:] + ':00'
        Logger.getlogging().debug(contents[index])
        content = str(contents[index])
        nick = str(nicks[index])
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def step1(self, params):
    # Step1: take the doc id from the url; it parameterizes the comment url.
    docurl = self.r.parse('^http[s]{0,1}://bbs\.hupu\.com\/(\d+)', params.originalurl)
    if docurl:
        docurl = docurl[0]
    else:
        Logger.getlogging().debug('{url}:20000'.format(url=params.originalurl))
        return
    xparser = XPathUtility(params.content)
    # Page count from the pager.
    pageList = xparser.getcomments('//div[@class="page"]/a')
    if not pageList:
        pagenum = 1
    else:
        pagenum = pageList[-2]
    if int(pagenum) >= self.maxpages:
        pagenum = self.maxpages
    # Total comment count.
    curcmtnum = xparser.getnumber('//span[@class="browse"]')
    NewsStorage.setcmtnum(params.originalurl, curcmtnum)
    dbcmtnum = CMTStorage.getcount(params.originalurl, True)
    if dbcmtnum >= curcmtnum:
        return
    # Fetch only the pages beyond what is already stored.
    start = int(dbcmtnum / self.page_size) + 1
    end = int(pagenum)
    if end > start + self.maxpages:
        start = end - self.maxpages
    params.customized['page'] = 1
    if end == 1:
        self.step2(params)
        return
    if start == 1:
        self.step2(params)
    comment_url = self.COMMENT_URL.format(docurl=docurl, page=end)
    self.storeurl(comment_url, params.originalurl, hupuComments.STEP_1_2,
                  {'docurl': docurl, 'page': end, 'start': start, 'end': end})
def step1(self, params):
    pattern = 'https://www.huxiu.com/article/(\d+).html'
    if not self.r.search(pattern, params.originalurl):
        Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_TEMPLATE)
        return
    object_id = self.r.parse(pattern, params.originalurl)[0]
    curcmtnum = XPathUtility(params.content).getnumber('//*[@class="article-pl pull-left"]')
    if not curcmtnum:
        Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_COMMNETS)
        return
    NewsStorage.setcmtnum(params.originalurl, curcmtnum)
    dbcmtnum = CMTStorage.getcount(params.originalurl, True)
    if dbcmtnum >= curcmtnum:
        return
    # Queue urls only for the comment pages not yet stored.
    pages = int(math.ceil(float(curcmtnum - dbcmtnum) / self.page_size))
    if pages >= self.maxpages:
        pages = self.maxpages
    for page in range(1, pages + 1):
        commonurl = self.COMMONURL.format(object_id=object_id, page=page)
        self.storeurl(commonurl, params.originalurl, HuxiupostComments.EACH)
def get_url_id(self, params):
    """Only used for Tencent Video (v.qq.com): a cid identifies a
    series / collection / film, a vid identifies a single episode."""
    CID_PATTERN = 'https?://v\.qq\.com/x/cover/(\w+).html'
    CID_URL = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&cid={cid}'
    VID_PATTERN1 = 'https?://v\.qq\.com/x/cover/\w+/(\w+).html'
    VID_PATTERN2 = 'https?://v\.qq\.com/x/page/(\w+)\.html'
    VID_URL = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&vid={vid}'
    if self.r.search(CID_PATTERN, params.originalurl):
        cid = self.r.parse(CID_PATTERN, params.originalurl)[0]
        url = CID_URL.format(cid=cid)
        self.storeurl(url, params.originalurl, self.STEP_COMMENT_FIRST_PAGE)
    elif self.r.search(VID_PATTERN1, params.originalurl):
        vid = self.r.parse(VID_PATTERN1, params.originalurl)[0]
        url = VID_URL.format(vid=vid)
        self.storeurl(url, params.originalurl, self.STEP_COMMENT_FIRST_PAGE)
    elif self.r.search(VID_PATTERN2, params.originalurl):
        vid = self.r.parse(VID_PATTERN2, params.originalurl)[0]
        url = VID_URL.format(vid=vid)
        self.storeurl(url, params.originalurl, self.STEP_COMMENT_FIRST_PAGE)
    # publish_date: prefer the embedded field, fall back to the page markup.
    publish_date = self.r.getid('publish_date', params.content, split=':')
    if not publish_date:
        publish_date = XPathUtility(params.content).getstring(
            '//*[@class="video_tags"]/span|//*[@class="date"]|//*[@class="tag_item"]')
    publish_date = TimeUtility.getuniformtime(publish_date)
    if publish_date:
        NewsStorage.setpublishdate(params.originalurl, publish_date)
    self.setclick(params)
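# How the three url shapes map onto cid/vid extraction in get_url_id. The
# sample urls are hypothetical; the patterns are the ones defined above.
import re

def _demo_qq_video_ids():
    cid_pattern = r'https?://v\.qq\.com/x/cover/(\w+).html'
    vid_pattern1 = r'https?://v\.qq\.com/x/cover/\w+/(\w+).html'
    vid_pattern2 = r'https?://v\.qq\.com/x/page/(\w+)\.html'
    # Cover page without an episode id: extracts the cid.
    assert re.search(cid_pattern, 'https://v.qq.com/x/cover/abc123.html').group(1) == 'abc123'
    # Cover page with an episode id: extracts the vid.
    assert re.search(vid_pattern1, 'https://v.qq.com/x/cover/abc123/d0123xyz.html').group(1) == 'd0123xyz'
    # Stand-alone episode page: extracts the vid.
    assert re.search(vid_pattern2, 'https://v.qq.com/x/page/d0123xyz.html').group(1) == 'd0123xyz'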
def process(self, params):
    # S2 query process.
    if SPIDER_CHANNEL_S2 == SpiderConfigure.getinstance().getchannel():
        if SPIDER_S2_WEBSITE_TYPE not in params.customized:
            return True
    xparser = XPathUtility(params.content)
    pageinfo = PageBasicInfo()
    template = None
    for template in TemplateManager.getxpaths(params.url):
        Logger.getlogging().debug('URL_TEMPLATE {url}\t{template}'.format(
            url=params.url,
            template=template[TemplateManager.XPATH_KEY_URL_TEMPLATE]))
        pageinfo, items = self.parsefromcontent(params, template, xparser)
        if constant.SPIDER_S2_WEBSITE_TYPE in params.customized:
            pageinfo.type = params.customized[constant.SPIDER_S2_WEBSITE_TYPE]
    if template is None:
        Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_TEMPLATE)
    # Overwrite with the raw page fields where the template found nothing.
    pageinfo.url = params.url
    if not pageinfo.title:
        pageinfo.title = params.page_title
    if not pageinfo.body:
        pageinfo.body = params.page_body
    if not pageinfo.pubtime:
        pageinfo.pubtime = params.html_time
    NewsStorage.seturlinfos(pageinfo)
def step1(self, params):
    Logger.getlogging().info("LaohuComments.STEP_1")
    # 1. Take the subdomain of the original url.
    field = self.r.parse('^http://(\w+)\.laohu\.com/.*', params.originalurl)[0]
    if field == 'bbs':
        # Forum page: fetch the uniqid first.
        self.storeurl(params.originalurl, params.originalurl,
                      LaohuComments.STEP_2_BBS, {'field': field})
    else:
        # Non-forum page, e.g. http://ff.laohu.com/201612/215072.html
        xhtml = XPathUtility(html=params.content)
        token = xhtml.getlist('//*[@id="t_token"]')[0]
        sourceId = self.r.getid('source_id', params.content, '\s*=\s*')
        # First comment page.
        # LaohuComments.COMMENTS_URL = 'http://member.laohu.com/comment/show/?token=%s&oder=new'
        comments_url = LaohuComments.COMMENTS_URL % token
        # Hand the first page of comments to the download platform.
        self.storeurl(comments_url, params.originalurl, LaohuComments.STEP_2,
                      {'token': token, 'sourceId': sourceId})
def bbs_step2(self, params):
    try:
        xparser = XPathUtility(params.content)
        comment_counts = int(xparser.getnumber('//*[@id="msgsubject"]/font'))
        if comment_counts == 0:
            return
        # Incremental check: skip when nothing new has arrived.
        cmtnum = CMTStorage.getcount(params.originalurl, True)
        if cmtnum >= comment_counts:
            return
        pagecount = xparser.getnumber('//*[@id="pager_top"]')
        for page in range(0, pagecount + 1):
            commentUrl = JjwxcBbsComments.COMMENTS_URL.format(url=params.originalurl, pageno=page)
            Logger.getlogging().debug(commentUrl)
            self.storeurl(commentUrl, params.originalurl, JjwxcBbsComments.BBS_NEXT_PAGE,
                          {'page': page, 'pagecount': pagecount})
        NewsStorage.setcmtnum(params.originalurl, comment_counts)
    except Exception:
        traceback.print_exc()