def step2(self, params):
    """Filter the video search results by publish time and title, then store the url list."""
    q = params.customized['query']
    soup = BeautifulSoup(params.content, 'html5lib')
    divs = soup.select('.videobox')
    if not divs:
        Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
        return
    urllist = []
    for div in divs:
        title = div.select_one('.title').get_text()
        tm = getuniformtime(div.select_one('.date').get_text())
        url = div.select_one('.title > a').get('href')
        Logger.getlogging().debug(title)
        if not compareNow(tm, self.querylastdays):
            Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
            continue
        if not Common.checktitle(Common.urldec(q), title):
            Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
            continue
        urllist.append(url)
    # Store the final url list.
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
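# Most handlers in this file repeat one filter: keep a search hit only when its
# publish time falls within querylastdays AND its title matches the query.
# A minimal standalone sketch of that predicate, assuming plain
# 'YYYY-MM-DD HH:MM' time strings and substring matching (the real helpers,
# getuniformtime/compareNow/Common.checktitle, tolerate much messier input):
import datetime

def _demo_keep_hit(pubtime_text, title, query, lastdays, now=None):
    now = now or datetime.datetime.now()
    pubtime = datetime.datetime.strptime(pubtime_text, '%Y-%m-%d %H:%M')
    recent = (now - pubtime).days <= lastdays
    matched = query.lower() in title.lower()
    return recent and matched

assert _demo_keep_hit('2017-01-02 03:04', 'Foo news', 'foo', 3,
                      now=datetime.datetime(2017, 1, 3))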
def step2(self, params):
    soup = BeautifulSoup(params.content, 'html5lib')
    if soup.find(attrs={'id': re.compile('noresult_part._container')}):
        Logger.getlogging().warning('{url}:40000 No results'.format(url=params.url))
        return
    results = soup.select('.results > .vrwrap')
    if not results:
        Logger.getlogging().warning('{url}:40000 No results'.format(url=params.url))
        return
    urllist = []
    newurllist = []
    for item in results:
        try:
            if not item.select_one('h3.vrTitle > a'):
                continue
            title = item.select_one('h3.vrTitle > a').get_text()
            href = item.select_one('h3.vrTitle > a').get('href')
            timestr = item.select_one('.news-detail > .news-info > .news-from').get_text()
            times = getuniformtime(timestr)
            Logger.getlogging().debug('title:' + title)
            Logger.getlogging().debug('time:' + times)
            if compareNow(times, self.querylastdays):
                Logger.getlogging().debug('href:' + href)
                urllist.append(href)
                newitem = item.select_one('#news_similar')
                if newitem:
                    newhref = 'http://news.sogou.com/news' + newitem.get('href')
                    Logger.getlogging().debug('newhref:' + newhref)
                    newurllist.append(newhref)
        except:
            Logger.printexception()
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_NEWS)
    if len(newurllist) > 0:
        self.__storeqeuryurllist__(newurllist, self.NEWS_EACH_2)
def step2(self, params):
    """Filter the search results by title and publish time, then store the url list."""
    try:
        key = params.customized['key']
        soup = BeautifulSoup(params.content, 'html5lib')
        searchListOne = soup.select('.searchListOne > ul > li > div')
        if not searchListOne:
            Logger.getlogging().warning('{}:40000 No urllist'.format(params.originalurl))
            return
        # The last item is <li id=search_msg style="display:none"></li>; filter it out.
        lis = soup.select('.searchListOne > ul > li')[:-1]
        urllist = []
        for li in lis:
            url = li.select_one('h3 > a').get('href')
            tm = getuniformtime(li.select('.source > span')[0].get_text())
            title = li.select_one('h3').get_text()
            if Common.checktitle(Common.urldec(key), title):
                if compareNow(tm, self.querylastdays):
                    urllist.append(url)
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
    except:
        Logger.printexception()
        Logger.getlogging().error('extract comment error from {site}'.format(site=params.url))
def process(self, params):
    if params.step == S2Query.STEP_1:
        html = etree.HTML(params.content)
        results = html.xpath('//*[@id="results"]')
        if not results:
            return
        totalpage = html.xpath('//*[@id="div_3"]/*[@class="page"]/span/text()')
        if totalpage:
            totalpage = int(self.r.parse(r'(\d+)', totalpage[0].split('/')[-1])[0])
        else:
            Logger.getlogging().info('there are no results you want!')
            return
        if totalpage >= self.maxpages:
            totalpage = self.maxpages
        if totalpage == 0:
            return
        urllist = []
        for pages in range(0, totalpage):
            searchurl = S2Query.S2_URL % (pages + 1, params.customized['key'])
            urllist.append(searchurl)
        self.__storeqeuryurllist__(urllist, S2Query.STEP_2, {'key': params.customized['key']})
    elif params.step == S2Query.STEP_2:
        comquerkey = Common.urldec(params.customized['key']).decode('gbk').encode('utf-8')
        soup = BeautifulSoup(params.content, 'html5lib')
        urllist = []
        divs = soup.find_all(attrs={'class': 'result f s0'})
        if not divs:
            return
        for div in divs:
            title = div.select_one('h3.c-title').get_text()
            title = ''.join(title.strip().split())
            url_tm = div.select_one('.c-showurl').get_text()
            tm = getuniformtime(url_tm.split('/')[-1])
            url = 'http://' + '/'.join(url_tm.split('/')[0:-1])
            Logger.getlogging().debug(title)
            if not Common.checktitle(comquerkey, title):
                Logger.getlogging().warning('{url}:40000 out of range, the title!'.format(url=params.originalurl))
                continue
            if not compareNow(tm, self.querylastdays):
                Logger.getlogging().warning('{url}:40000 out of range, the time!'.format(url=params.originalurl))
                continue
            urllist.append(url)
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def step3(self, params):
    """Fetch the url list for news-type results."""
    key = Common.urldec(params.customized['query'])
    soup = BeautifulSoup(params.content, 'html5lib')
    lis = soup.select('.wzlist > ul > li.wztitle')
    if lis:
        urllist = []
        for li in lis:
            title = li.select_one('a').get_text()
            if not Common.checktitle(key, title):
                continue
            pubtime = li.select_one('span').get_text()
            url = 'http://www.52tian.net' + li.select_one('a').get('href')
            if compareNow(getuniformtime(pubtime), self.querylastdays):
                urllist.append(url)
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def baidutiebasearch_step3(self, params):
    # Baidu Tieba ships the thread list commented out inside the page, so the
    # markup must be regex-extracted from the <!-- ... --> wrapper before parsing.
    content = ''
    p = r'<!--[\s\S]{0,}(<ul id="thread_list".*[\s\S]{0,})--></code><script>'
    if re.search(p, params.content):
        content = re.findall(p, params.content)[0]
    if not content:
        Logger.log(params.url, constant.ERRORCODE_WARNNING_NORESULTS)
        return
    soup = BeautifulSoup(content, 'html5lib')
    top_list = soup.select('#thread_top_list > li.j_thread_list')
    thread_list = soup.select('#thread_list > li.j_thread_list')
    urllist = []
    for item in top_list + thread_list:
        try:
            pubtimeobj = item.find(attrs={'class': 'threadlist_reply_date pull_right j_reply_data'})
            if not pubtimeobj:
                pubtimeobj = item.find(attrs={'class': 'pull-right is_show_create_time'})
            pubtime = pubtimeobj.get_text().strip().replace(' ', '')
            href = item.select_one('.threadlist_title > a').get('href')
            title = item.select_one('.threadlist_title > a').get('title')
            Logger.getlogging().debug(title)
            Logger.getlogging().debug(pubtime)
            pubtime = self.getuniformtime(pubtime)
            Logger.getlogging().debug(pubtime)
            if self.isyestoday(pubtime):
                pubtime2obj = item.find(attrs={'class': 'pull-right is_show_create_time'})
                if pubtime2obj:
                    pubtime2 = self.getuniformtime(pubtime2obj.get_text())
                    if not gettimeutil.compareNow(pubtime2, self.querylastdays):
                        continue
                Logger.getlogging().debug('https://tieba.baidu.com' + href)
                urllist.append('https://tieba.baidu.com' + href)
        except:
            Logger.printexception()
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
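# A minimal sketch of the extraction trick above: Tieba serves the thread list
# commented out (for deferred rendering), so the markup has to be pulled from
# the <!-- ... --> wrapper before BeautifulSoup can see it. The html sample is
# made up; the selector mirrors the real page structure:
import re
from bs4 import BeautifulSoup

def _demo_extract_commented_markup():
    html = ('<code><!--<ul id="thread_list">'
            '<li class="j_thread_list">hello</li></ul>--></code><script>')
    m = re.search(r'<!--[\s\S]*?(<ul id="thread_list"[\s\S]*?)-->', html)
    if not m:
        return []
    soup = BeautifulSoup(m.group(1), 'html5lib')
    return [li.get_text() for li in soup.select('#thread_list > li.j_thread_list')]

assert _demo_extract_commented_markup() == ['hello']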
def step2(self, params):
    """Filter the video results by publish time and title, then store the url list."""
    info = Common.urldec(params.customized['query'])
    soup = BeautifulSoup(params.content, 'html5lib')
    videos = soup.select('.uiVideo > .uiVideo__item')
    if videos:
        urllist = []
        for video in videos:
            title = video.select_one('h3 > a').get('title')
            pubtime = video.select('.result__data > span')[-1].get_text()
            url = video.select_one('h3 > a').get('href')
            if compareNow(getuniformtime(pubtime), self.querylastdays):
                if Common.checktitle(info, title):
                    urllist.append(url)
                else:
                    Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
            else:
                Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def step2(self, params):
    """Filter the book search results by title and publish time, then store the url list."""
    try:
        key = Common.urldec(params.customized['key'])
        soup = BeautifulSoup(params.content, 'html5lib')
        books = soup.select('#searchResult > .book')
        if books:
            urllist = []
            for book in books:
                title = book.select_one('h3 > a').get_text()
                if key not in title:
                    continue
                pubtime = book.select('.w_auth')[-2].get_text()
                url = book.select_one('h3 > a').get('href')
                if compareNow(getuniformtime(pubtime), self.querylastdays):
                    urllist.append(url)
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_NEWS)
    except:
        Logger.printexception()
        Logger.getlogging().error('extract comment error from {site}'.format(site=params.url))
def getsearchresult(self, params):
    info = params.customized['query']
    soup = BeautifulSoup(params.content, 'html5lib')
    lis = soup.select('ul.ckl_cktpp > li.cfix')
    urllist = []
    if lis:
        for li in lis:
            title = li.select_one('h3').get_text()
            if not Common.checktitle(info, title):
                continue
            times = getuniformtime(li.select('p')[-2].get_text())
            url = li.select_one('h3 > a').get('href')
            if compareNow(times, self.querylastdays):
                urllist.append(url)
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
        return len(urllist)
    else:
        return -1
def pageprocess(self, params):
    # Step3: from the JSON returned by Step2, extract
    #   title:   jsondata['data'][0..19]['title']
    #   link:    jsondata['data'][0..19]['url']
    #   pubtime: jsondata['data'][0..19]['modifydate'] (truncate to the first
    #            10 chars; only the date part is comparable)
    info = params.customized['query']
    jsondata = json.loads(params.content)
    searchresult = jsondata['data']
    urllist = []
    for result in searchresult:
        title = result['title']
        url = result['url']
        pubtime = result['modifydate']
        if compareNow(getuniformtime(pubtime), self.querylastdays):
            if Common.checktitle(info, title):
                urllist.append(self.MAIN_DOMAIN + url)
            else:
                Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
        else:
            Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
    self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
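# A quick standalone illustration of the payload shape documented above; the
# sample record is made up, the field names mirror the site's API:
import json

def _demo_parse_payload():
    raw = '{"data": [{"title": "t", "url": "/v/1", "modifydate": "2017-01-02 03:04:05"}]}'
    return [(r['title'], r['url'], r['modifydate'][:10]) for r in json.loads(raw)['data']]

assert _demo_parse_payload() == [('t', '/v/1', '2017-01-02')]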
def step2(self, params):
    """Filter the forum search results by publish time and title, then store the url list."""
    query = params.customized['query']
    soup = BeautifulSoup(params.content, 'html.parser')
    trs = soup.select('#schend')
    if not trs:
        return
    urllist = []
    for tr in trs:
        title = tr.select_one('.sb14b').get_text()
        content = etree.HTML(str(tr))
        publicTimes = content.xpath('//*[@id="schend"]/table[1]/tr/td[3]/text()')[-1].strip()
        href = tr.select_one('.sb14b').get('href')
        id = re.findall(r'id=(\d+)&', href)[0]
        url = 'http://forum.home.news.cn/detail/' + id + '/1.html'
        if not compareNow(getuniformtime(publicTimes), self.querylastdays):
            continue
        if not Common.checktitle(Common.trydecode(query), Common.trydecode(title)):
            continue
        urllist.append(url)
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
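# The url rewrite above pulls the numeric thread id out of the hit's href and
# splices it into the canonical detail url. A quick standalone check (the href
# value is made up):
import re

def _demo_detail_url(href='viewthread?id=12345&page=1'):
    tid = re.findall(r'id=(\d+)&', href)[0]
    return 'http://forum.home.news.cn/detail/' + tid + '/1.html'

assert _demo_detail_url() == 'http://forum.home.news.cn/detail/12345/1.html'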
def process(self, params):
    if params.step == NarutomS2Query.S2QUERY_FIRST_PAGE:
        # Step2: read the total hit count via xpath //*[@id="results"]/span,
        # build the result-page urls from it, and queue them.
        html = etree.HTML(params.content)
        nodes = html.xpath('//*[@id="results"]/span')
        if len(nodes) == 0:
            return
        # Total hit count, e.g. "为您找到相关结果1,307个" -> 1307.
        count = 0
        totalstr = nodes[0].text.replace(',', '')
        if self.r.search(u'\d+', totalstr):
            count = int(self.r.parse(u'(\d+)', totalstr)[0])
            # The site returns at most 750 results.
            if count > self.MAX_COUNT:
                count = self.MAX_COUNT
        else:
            return
        # Build all result-page urls from the count.
        info = params.customized['query']
        keyvalue = Common.urlenc(info)
        page_count = float(count) / self.DEFAULT_PAGE_SIZE
        querylist = [NarutomS2Query.FIRST_PAGE.format(key=keyvalue)]
        if count > self.DEFAULT_PAGE_SIZE:
            # pageno 1 is the second page, 2 the third, ...; the range 1-74
            # covers pages 2-75.
            for page in range(1, int(math.ceil(page_count)), 1):
                querylist.append(NarutomS2Query.QUERY_TEMPLATE.format(key=keyvalue, pageno=page))
        self.__storeqeuryurllist__(querylist, NarutomS2Query.S2QUERY_EACH_PAGE, {'query': info})
    elif params.step == NarutomS2Query.S2QUERY_EACH_PAGE:
        # Step3: from Step2's pages, collect the result urls via
        # xpath //*[@id="results"]/div/h3/a/@href and store them.
        info = params.customized['query']
        html = etree.HTML(params.content)
        nodes = html.xpath('//*[@id="results"]/div/h3/a/@href')
        pubtimestr = html.xpath('//*[@class="c-showurl"]')
        # Only check dates when every hit carries a time string.
        datecheck = len(pubtimestr) == len(nodes)
        urllist = []
        for index in range(0, len(nodes), 1):
            if datecheck:
                # A time string was found; keep the url only if it is recent enough.
                if self.r.search(r'(\d+-\d+-\d+)', pubtimestr[index].text):
                    pubtime = getuniformtime(self.r.parse(r'(\d+-\d+-\d+)', pubtimestr[index].text)[0])
                    if compareNow(pubtime, int(self.querylastdays)):
                        urllist.append(nodes[index])
            else:
                urllist.append(nodes[index])
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
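# A quick standalone check of the page-count arithmetic used above: under
# Python 2, float() must wrap the numerator before the division, otherwise
# integer division floors the quotient and ceil() has nothing left to round:
import math

def _demo_page_count(count=1307, size=10):
    # float(count / size) would give 130.0 under Python 2 (1307 // 10 == 130);
    # float(count) / size gives 130.7, which ceil() rounds up to 131 pages.
    return int(math.ceil(float(count) / size))

assert _demo_page_count() == 131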
def processVideo(self, params):
    if params.step == MofangS2Query.S2QUERY_FIRST_PAGE:
        # Step2: read the total video count from the returned JSON
        # (comments['totalnums']). Each response carries 20 records, so divide
        # the total by 20 (rounded up) to get the page count.
        info = params.customized['query']
        keyvalue = Common.urlenc(info)
        try:
            jsondata = json.loads(params.content)
            comments_count = jsondata['totalnums']
        except:
            Logger.getlogging().warning('{}:40000'.format(params.url))
            return
        # Nothing found: return.
        if int(comments_count) == 0:
            return
        page_count = int(math.ceil(float(comments_count) / self.DEFAULT_PAGE_SIZE))
        # Build all result-page urls from page_count (most recent week).
        querylist = []
        if page_count > 0:
            for page in range(1, page_count + 1, 1):
                url = MofangS2Query.QUERY_TEMPLATE.format(key=keyvalue, pageno=page, pagesize=self.DEFAULT_PAGE_SIZE)
                Logger.getlogging().debug(url)
                querylist.append(url)
            self.__storeqeuryurllist__(querylist, MofangS2Query.S2QUERY_EACH_PAGE, {'query': info})
    elif params.step == MofangS2Query.S2QUERY_EACH_PAGE:
        # Step3: from the JSON returned by Step2, extract
        #   title:   comments['data'][0..19]['title']
        #   link:    comments['data'][0..19]['url']
        #   pubtime: comments['data'][0..19]['inputtime'] (truncate to the first
        #            10 chars; only the date part is comparable)
        info = params.customized['query']
        try:
            jsondata = json.loads(params.content)
            searchresult = jsondata['data']
        except:
            Logger.getlogging().warning('{}:40000'.format(params.url))
            return
        urllist = []
        for index in range(0, len(searchresult), 1):
            if searchresult[index]['title'] is not None:
                # Keep urls whose title contains the query keyword.
                if Common.checktitle(info, searchresult[index]['title']):
                    if searchresult[index]['inputtime'] is not None:
                        pubtime = getuniformtime(str(searchresult[index]['inputtime']))
                        if compareNow(pubtime, int(self.querylastdays)):
                            urllist.append(searchresult[index]['url'])
                    else:
                        # No publish time: assume it is within the query window.
                        urllist.append(searchresult[index]['url'])
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def getpagecomments_step2(self, params):
    try:
        page = params.customized['page']
        soup = BeautifulSoup(params.content, "html5lib")
        d_post_content_main = soup.select('#j_p_postlist > div.j_l_post')
        if page == 1:
            # The first floor is the post itself: record its publish date and
            # skip the whole thread when it is older than the comment window.
            main_item = d_post_content_main[0]
            pubtimes = ''
            pubtimesobj = main_item.select('.tail-info')
            if pubtimesobj:
                pubtimes = getuniformtime(pubtimesobj[-1].get_text().strip())
            else:
                pubtimeslist = re.findall(r'\d+-\d+-\d+ \d+:\d+', str(main_item))
                if pubtimeslist:
                    pubtimes = getuniformtime(pubtimeslist[0])
            if pubtimes:
                NewsStorage.setpublishdate(params.originalurl, pubtimes)
                if not compareNow(pubtimes, self.COMMENT_LIMIT_DAYS):
                    Logger.log(params.originalurl, constant.ERRORCODE_WARNNING_NOMATCHTIME)
                    # Posts older than 7 days: stop fetching replies/comments.
                    return False
            d_post_content_main = d_post_content_main[1:]
        for item in d_post_content_main:
            try:
                comment = item.find(attrs={'id': re.compile("post_content")})
                if not comment:
                    continue
                content = comment.get_text().strip()
                pubtimes = ''
                pubtimesobj = item.select('.tail-info')
                if pubtimesobj:
                    pubtimes = getuniformtime(pubtimesobj[-1].get_text().strip())
                else:
                    pubtimeslist = re.findall(r'\d+-\d+-\d+ \d+:\d+', str(item))
                    if pubtimeslist:
                        pubtimes = getuniformtime(pubtimeslist[0])
                if not pubtimes:
                    # No time found: store the comment dated today.
                    if not CMTStorage.exist(params.originalurl, content, TimeUtility.getdatebefore(0), 'nick'):
                        CMTStorage.storecmt(params.originalurl, content, TimeUtility.getdatebefore(0), 'nick')
                    continue
                # Store only comments posted since yesterday.
                Logger.getlogging().debug(pubtimes)
                if self.isyestoday(pubtimes):
                    if not CMTStorage.exist(params.originalurl, content, pubtimes, 'nick'):
                        CMTStorage.storecmt(params.originalurl, content, pubtimes, 'nick')
            except:
                Logger.printexception()
        return True
    except:
        Logger.printexception()
        return False
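# CMTStorage.exist()/storecmt() above is a check-then-insert dedup against the
# comment store. A minimal in-memory sketch of the same pattern (CMTStorage is
# project-internal; `store` here is just a set standing in for it):
def _demo_store_once(store, url, content, pubtime, nick='nick'):
    key = (url, content, pubtime, nick)
    if key not in store:    # CMTStorage.exist(...)
        store.add(key)      # CMTStorage.storecmt(...)
        return True
    return False

seen = set()
assert _demo_store_once(seen, 'u', 'hello', '2017-01-02')
assert not _demo_store_once(seen, 'u', 'hello', '2017-01-02')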
def process(self, params):
    if params.step == HuyaS2Query.S2QUERY_FIRST_PAGE:
        Logger.getlogging().debug("HuyaS2Query.S2QUERY_FIRST_PAGE")
        # Step2: read the total hit count from the returned html
        # (xpath //*[@id="tab1"]/div[1]/div/span/em), compute the page count
        # (total / 20, rounded up), build the result-page urls, and queue them.
        soup = BeautifulSoup(params.content, 'html5lib')
        if soup.select('.search-no-data-wrap'):
            return
        totalstr = soup.select_one('.search-list > .mod-tab-hd > .act')
        if not totalstr:
            return
        # Total result count, e.g. 160.
        totalstr = totalstr.get_text().replace(',', '')
        count = int(re.findall(r'\d+', totalstr)[0])
        # Build all result-page urls from the count.
        info = params.customized['query']
        keyvalue = Common.urlenc(info)
        querylist = []
        pages = int(math.ceil(float(count) / self.DEFAULT_PAGE_SIZE))
        if pages >= self.maxpages:
            pages = self.maxpages
        for page in range(1, pages + 1, 1):
            url = HuyaS2Query.QUERY_TEMPLATE.format(pageno=page, key=keyvalue)
            Logger.getlogging().debug(url)
            querylist.append(url)
        self.__storeqeuryurllist__(querylist, HuyaS2Query.S2QUERY_EACH_PAGE, {'query': info})
    elif params.step == HuyaS2Query.S2QUERY_EACH_PAGE:
        info = params.customized['query']
        soup = BeautifulSoup(params.content, 'html5lib')
        if soup.select('.search-no-data-wrap'):
            return
        divs = soup.select('ul.video-list')
        if divs:
            divs = divs[-1].select('li')
        if not divs:
            return
        urllist = []
        for div in divs:
            video = div.select_one('.video-title > .video-wrap')
            times = getuniformtime(div.select_one('.result-data').get_text())
            titles = video.get('title')
            url = video.get('href')
            if compareNow(times, self.querylastdays) and Common.checktitle(info, titles):
                Logger.getlogging().debug(titles)
                Logger.getlogging().debug(url)
                urllist.append(url)
            else:
                Logger.getlogging().debug(titles + ' not match title or out of time')
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def process(self, params):
    if params.step == Cine107S2Query.S2QUERY_FIRST_PAGE:
        # Step2: read the total result count from the returned content.
        info = params.customized['query']
        soup = BeautifulSoup(params.content, 'html5lib')
        results = soup.select('#results')[0]
        # No results: return.
        if results.get_text().find('抱歉') > -1:
            return
        resultStr = results.select('span.support-text-top')[0].get_text().strip()
        resultStr = resultStr[8:resultStr.index('个')]
        result_counts = int(resultStr.replace(',', ''))
        Logger.getlogging().debug(result_counts)
        # Only the first 75 pages [0:74] of results are viewable; cap at 750 hits.
        if result_counts > 750:
            result_counts = 750
        # Compute page_count, the number of pages to loop over.
        if result_counts < 10:
            page_count = 0
        else:
            page_count = int(math.ceil(float(result_counts) / Cine107S2Query.DEFAULT_PAGE_SIZE))
        # Build all result-page urls from page_count.
        querylist = []
        for page in range(0, page_count):
            url = Cine107S2Query.QUERY_TEMPLATE.format(key=info, pageno=page)
            querylist.append(url)
        self.__storeqeuryurllist__(querylist, Cine107S2Query.S2QUERY_EACH_PAGE, {'query': info})
    elif params.step == Cine107S2Query.S2QUERY_EACH_PAGE:
        # Step3: collect the result urls from Step2's pages and store them.
        info = params.customized['query']
        soup = BeautifulSoup(params.content, 'html.parser')
        titles = soup.select('h3.c-title')
        times = soup.select('span.c-showurl')
        urllist = []
        index = 0
        for result in titles:
            title = result.get_text().strip()
            nodeUrl = result.select('a')[0].get('href')
            timeStr = times[index].get_text().strip()
            if timeStr.find('html') > -1:
                timeStr = timeStr[timeStr.index('html') + 5:]
            elif timeStr.find('...') > -1:
                timeStr = timeStr[timeStr.index('...') + 4:]
            # Keep urls whose title contains the query keyword and whose post
            # date falls within the query window.
            if self.r.search(ur'(\d+-\d+-\d+)', timeStr):
                timeStr = self.r.parse(ur'(\d+-\d+-\d+)', timeStr)[0]
                if compareNow(getuniformtime(timeStr), self.querylastdays):
                    if Common.checktitle(Common.urldec(info), title):
                        urllist.append(nodeUrl)
            index += 1
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
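# The timeStr trimming above strips the url prefix out of Baidu's c-showurl
# text before the date regex runs. A quick standalone check with a made-up
# showurl string:
def _demo_trim_showurl(timeStr='www.example.com/a.html 2017-01-02'):
    if timeStr.find('html') > -1:
        timeStr = timeStr[timeStr.index('html') + 5:]
    elif timeStr.find('...') > -1:
        timeStr = timeStr[timeStr.index('...') + 4:]
    return timeStr

assert _demo_trim_showurl() == '2017-01-02'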