def step2(self, params):
    """Collect result URLs from one search page and hand them to the store.

    Every ``.s_r_txt`` element in ``params.content`` is inspected: its title
    and link come from the ``h3 > a`` anchor and its time text from the last
    ``<p>``.  A link is kept when ``TimeUtility.compareNow`` accepts the
    normalised time against ``self.querylastdays`` and ``Common.checktitle``
    accepts the title against the url-decoded ``params.customized['info']``;
    otherwise the rejection is logged with the matching warning code.

    The list (possibly empty) is always passed to ``__storeurllist__``.
    """
    info = Common.urldec(params.customized['info'])
    soup = BeautifulSoup(params.content, 'html5lib')
    urllist = []
    # select() returns an empty list when nothing matches, so no extra
    # emptiness check is needed before looping.
    for item in soup.select('.s_r_txt'):
        anchor = item.select_one('h3 > a')
        title = anchor.get_text()
        url = anchor.get('href')
        curtime = item.select('p')[-1].get_text().strip()
        try:
            if TimeUtility.compareNow(TimeUtility.getuniformtime(curtime),
                                      self.querylastdays):
                if Common.checktitle(info, title):
                    urllist.append(url)
                else:
                    Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
            else:
                Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
        except Exception:
            # Best effort: if filtering itself fails, keep the link rather
            # than lose it.  ``Exception`` (not a bare ``except``) so that
            # KeyboardInterrupt / SystemExit still propagate.
            urllist.append(url)
    self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def step2(self, params):
    """Collect topic URLs from one search page and store the accepted ones.

    Rows are taken from the last ``tbody`` matched by
    ``.search_topic_list > form > table > tbody``.  For each ``tr`` the link
    and title come from ``.p_title > a`` and the time text from the fourth
    ``td``.  A link is kept when ``TimeUtility.compareNow`` accepts the
    normalised time against ``self.querylastdays`` and ``Common.checktitle``
    accepts the title against the url-decoded ``params.customized['query']``;
    rejections are logged with the matching warning code.

    Nothing is stored when the page has no matching table or no row passes.
    """
    key = params.customized['query']
    query = Common.urldec(key)
    soup = BeautifulSoup(params.content, 'html5lib')
    tbodies = soup.select('.search_topic_list > form > table > tbody')
    if not tbodies:
        # No result table on the page: indexing [-1] would raise IndexError.
        return
    urllist = []
    for row in tbodies[-1].select('tr'):
        anchor = row.select_one('.p_title > a')
        url = anchor.get('href')
        title = anchor.get_text()
        curtime = row.select('td')[3].get_text()
        if TimeUtility.compareNow(TimeUtility.getuniformtime(curtime),
                                  self.querylastdays):
            if Common.checktitle(query, title):
                urllist.append(url)
            else:
                Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
        else:
            Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
    if urllist:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
def step_last(self, params):
    """Filter the entries of a result page and store the links that pass.

    Each ``#results > .result`` element supplies a time text
    (``.c-summary-1``) plus a title and link (``h3 > a``).  Entries rejected
    by ``TimeUtility.compareNow`` or by ``Common.checktitle`` are logged with
    the corresponding warning code; the remaining links are stored, but only
    when at least one was collected.
    """
    info = params.customized['query']
    soup = BeautifulSoup(params.content, 'html5lib')
    accepted = []
    for result in soup.select('#results > .result'):
        publish = result.select_one('.c-summary-1').get_text()
        anchor = result.select_one('h3 > a')
        title = anchor.get_text().strip()
        url = anchor.get('href').strip()

        uniform = TimeUtility.getuniformtime(publish)
        if not TimeUtility.compareNow(uniform, self.querylastdays):
            Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
            continue
        if not Common.checktitle(Common.trydecode(info),
                                 Common.trydecode(title)):
            Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
            continue
        accepted.append(url)

    if accepted:
        self.__storeurllist__(accepted, SPIDER_S2_WEBSITE_TIEBA)
def step2(self, params):
    """Parse one search result page and store the thread URLs that qualify.

    Each ``.pbw`` element supplies a time text (first ``p > span``) and a
    title/link (``h3.xs3 > a``).  Relative links are made absolute against
    the scheme and host of ``params.url``, and anything from ``&highlight``
    onwards is cut off the link.  Entries rejected by
    ``TimeUtility.compareNow`` (against ``self.querylastdays``) or by
    ``Common.checktitle`` (against ``params.customized['info']``) are logged
    with the matching warning code and skipped.

    Returns early, storing nothing, when the page has no ``.pbw`` element;
    otherwise the (possibly empty) list is passed to ``__storeurllist__``.
    """
    info = params.customized['info']
    soup = BeautifulSoup(params.content, 'html5lib')
    divs = soup.select('.pbw')
    if not divs:
        return

    # Scheme + host of the page being parsed.  Stopping at the first
    # '/', '?' or '#' works for any TLD and for https, and cannot run on
    # into a later "com"/"cn"/"net" inside the path or query string.
    root_match = re.match(r'https?://[^/?#]+', params.url)
    site_root = root_match.group(0) if root_match else None

    urllist = []
    for div in divs:
        tm = TimeUtility.getuniformtime(div.select('p > span')[0].get_text())
        anchor = div.select_one('h3.xs3 > a')
        geturl = anchor.get('href')
        title = anchor.get_text()

        # Anything that does not start with a scheme is a relative link.
        if not re.match(r'https?://', geturl):
            if site_root is None:
                # Cannot build an absolute URL; skip instead of failing on
                # an unbound prefix.
                Logger.getlogging().info(
                    'skip relative url %r: no site root in %r'
                    % (geturl, params.url))
                continue
            geturl = site_root + '/' + geturl.lstrip('/')

        # Drop the search-term highlight parameter and whatever follows it.
        highlight = re.search('(http.*)&highlight', geturl)
        if highlight:
            geturl = highlight.group(1)

        Logger.getlogging().info(Common.trydecode(title))

        # Compare the time first, then match the title.
        if not TimeUtility.compareNow(tm, self.querylastdays):
            Logger.log(geturl, constant.ERRORCODE_WARNNING_NOMATCHTIME)
            continue
        if not Common.checktitle(Common.trydecode(info),
                                 Common.trydecode(title)):
            Logger.log(geturl, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
            continue
        urllist.append(geturl)
    self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
def step2(self, params):
    """Filter the entries of one search page and store the links that pass.

    Each ``.sresult > ul > li`` element supplies a link (``.stitle > a``),
    a title (``.stitle``) and a time text (``.scontent``).  Links are
    prefixed with ``http://bbs.tgbus.com/``.  Entries rejected by
    ``TimeUtility.compareNow`` or ``Common.checktitle`` are logged with the
    corresponding warning code; the rest are stored, but only when at least
    one link was collected.
    """
    prefix = 'http://bbs.tgbus.com/'
    query = Common.urldec(params.customized['key'])
    soup = BeautifulSoup(params.content, 'html5lib')
    accepted = []
    for entry in soup.select('.sresult > ul > li'):
        href = entry.select_one('.stitle > a').get('href')
        title = entry.select_one('.stitle').get_text()
        curtime = entry.select_one('.scontent').get_text()

        uniform = TimeUtility.getuniformtime(curtime)
        if not TimeUtility.compareNow(uniform, self.querylastdays):
            Logger.log(prefix + href, constant.ERRORCODE_WARNNING_NOMATCHTIME)
            continue
        if not Common.checktitle(query, title):
            Logger.log(prefix + href, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
            continue
        accepted.append(prefix + href)

    if accepted:
        self.__storeurllist__(accepted, SPIDER_S2_WEBSITE_TIEBA)
def compareNow(curtime, days):
    """Forward ``curtime`` and ``days`` to ``TimeUtility.compareNow``.

    The original note describes this as "how many days time t is earlier
    than now"; presumably the result tells whether ``curtime`` lies within
    ``days`` days of the present -- confirm against ``TimeUtility``.
    """
    outcome = TimeUtility.compareNow(curtime, days)
    return outcome
def compareNow(self, curtime, days=None):
    """Normalise ``curtime`` and compare it via ``TimeUtility.compareNow``.

    ``days`` falls back to ``self.querylastdays`` whenever it is falsy
    (``None``, ``0``, ...).
    """
    window = days or self.querylastdays
    uniform = TimeUtility.getuniformtime(curtime)
    return TimeUtility.compareNow(uniform, window)