Пример #1
0
    def step2(self, params):
        """Parse one search-result page and store the matching video URLs.

        Each ``.s_r_txt`` block contributes the link of its ``h3 > a``
        anchor. The link is kept when the text of the block's last ``<p>``
        passes ``TimeUtility.compareNow`` against ``self.querylastdays`` and
        the anchor text passes ``Common.checktitle``; rejected links are
        logged with the corresponding warning code. The collected list is
        handed to ``__storeurllist__`` once, after all blocks were examined.

        :param params: response object; ``params.content`` holds the page
            HTML and ``params.customized['info']`` is run through
            ``Common.urldec`` to obtain the query used for title matching.
        """
        info = Common.urldec(params.customized['info'])
        soup = BeautifulSoup(params.content, 'html5lib')
        text_divs = soup.select('.s_r_txt')
        if not text_divs:
            return

        urllist = []
        for item in text_divs:
            anchor = item.select_one('h3 > a')
            if anchor is None:
                # No title link in this block, so there is nothing to collect.
                continue
            title = anchor.get_text()
            url = anchor.get('href')
            try:
                # The date lookup lives inside the try so that a block
                # without any <p> takes the same fallback as a bad date.
                curtime = item.select('p')[-1].get_text().strip()
                if not TimeUtility.compareNow(
                        TimeUtility.getuniformtime(curtime),
                        self.querylastdays):
                    Logger.log(url,
                               constant.ERRORCODE_WARNNING_NOMATCHTIME)
                elif not Common.checktitle(info, title):
                    Logger.log(url,
                               constant.ERRORCODE_WARNNING_NOMATCHTITLE)
                else:
                    urllist.append(url)
            except Exception:
                # Best effort: when the filters cannot be evaluated, keep the
                # link instead of dropping a possible match. ``Exception``
                # (not a bare except) lets KeyboardInterrupt/SystemExit
                # propagate.
                urllist.append(url)

        # Store once per page; storing inside the loop re-submitted the
        # growing list for every result item.
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
Пример #2
0
 def step2(self, params):
     """Parse one search-result page and store the matching topic URLs.

     Rows are read from the last ``tbody`` matched by
     ``.search_topic_list > form > table > tbody``. For each row the link
     and title come from ``.p_title > a`` and the publish time from the
     fourth ``<td>``. A link is kept when the time passes
     ``TimeUtility.compareNow`` against ``self.querylastdays`` and the title
     passes ``Common.checktitle``; rejected links are logged with the
     corresponding warning code.

     :param params: response object; ``params.content`` holds the page HTML
         and ``params.customized['query']`` is run through
         ``Common.urldec`` to obtain the query used for title matching.
     """
     key = params.customized['query']
     query = Common.urldec(key)
     soup = BeautifulSoup(params.content, 'html5lib')
     tbody = soup.select('.search_topic_list > form > table > tbody')
     if not tbody:
         # No result table on this page; indexing tbody[-1] would raise
         # IndexError.
         return

     urllist = []
     for li in tbody[-1].select('tr'):
         anchor = li.select_one('.p_title > a')
         cells = li.select('td')
         if anchor is None or len(cells) < 4:
             # Rows without a title link or a fourth cell cannot be
             # evaluated; skip them instead of aborting the page.
             continue
         url = anchor.get('href')
         title = anchor.get_text()
         curtime = cells[3].get_text()
         if TimeUtility.compareNow(TimeUtility.getuniformtime(curtime),
                                   self.querylastdays):
             if Common.checktitle(query, title):
                 urllist.append(url)
             else:
                 Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
         else:
             Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
     if urllist:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
Пример #3
0
 def step_last(self, params):
     """Collect result links from a search page and store the matches.

     Every ``#results > .result`` element yields a publish text
     (``.c-summary-1``), a title and a link (both from ``h3 > a``). Links
     whose publish text fails ``TimeUtility.compareNow`` against
     ``self.querylastdays`` or whose title fails ``Common.checktitle`` are
     logged with the corresponding warning code; the rest are stored, if
     there are any.

     :param params: response object; ``params.content`` holds the page HTML
         and ``params.customized['query']`` the query for title matching.
     """
     matched = []
     query = params.customized['query']
     page = BeautifulSoup(params.content, 'html5lib')
     for result in page.select('#results > .result'):
         publish = result.select_one('.c-summary-1').get_text()
         anchor = result.select_one('h3 > a')
         title = anchor.get_text().strip()
         link = anchor.get('href').strip()

         recent = TimeUtility.compareNow(
             TimeUtility.getuniformtime(publish), self.querylastdays)
         if not recent:
             Logger.log(link, constant.ERRORCODE_WARNNING_NOMATCHTIME)
             continue

         if not Common.checktitle(Common.trydecode(query),
                                  Common.trydecode(title)):
             Logger.log(link, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
             continue

         matched.append(link)

     if matched:
         self.__storeurllist__(matched, SPIDER_S2_WEBSITE_TIEBA)
    def step2(self, params):
        """Parse one search-result page and store the matching thread URLs.

        Each ``.pbw`` block yields a publish time (first ``p > span``), a
        link and a title (both from ``h3.xs3 > a``). Relative links are
        resolved against the scheme and host of ``params.url`` and a trailing
        ``&highlight...`` part is cut off. Links whose time fails
        ``TimeUtility.compareNow`` against ``self.querylastdays`` or whose
        title fails ``Common.checktitle`` are logged with the corresponding
        warning code; the remaining links are passed to ``__storeurllist__``.

        :param params: response object; ``params.content`` holds the page
            HTML, ``params.url`` the address of the search page and
            ``params.customized['info']`` the query for title matching.
        """
        info = params.customized['info']
        soup = BeautifulSoup(params.content, 'html5lib')
        divs = soup.select('.pbw')
        if not divs:
            return

        # Scheme and host of the search page, used to resolve relative links.
        # Matching up to the first '/' works for any TLD and cannot run past
        # the host the way a greedy '.*com' could.
        root_match = re.match(r'(https?://[^/]+)', params.url)
        site_root = root_match.group(1) if root_match else None

        urllist = []
        for div in divs:
            spans = div.select('p > span')
            anchor = div.select_one('h3.xs3 > a')
            if not spans or anchor is None:
                # Malformed entry: no date or no title link to evaluate.
                continue
            geturl = anchor.get('href')
            if not geturl:
                continue
            tm = TimeUtility.getuniformtime(spans[0].get_text())
            title = anchor.get_text()

            if not re.match(r'https?://', geturl):
                if site_root is None:
                    # The page address has no usable host, so the relative
                    # link cannot be turned into an absolute one.
                    Logger.getlogging().info(
                        'skip relative link without base: %s' % geturl)
                    continue
                geturl = site_root + '/' + geturl.lstrip('/')

            # Drop the search-highlight query suffix from the link.
            highlight = re.search('(http.*)&highlight', geturl)
            if highlight:
                geturl = highlight.group(1)

            Logger.getlogging().info(Common.trydecode(title))
            # Compare the publish time, then match the title.
            if not TimeUtility.compareNow(tm, self.querylastdays):
                Logger.log(geturl, constant.ERRORCODE_WARNNING_NOMATCHTIME)
                continue
            if not Common.checktitle(Common.trydecode(info),
                                     Common.trydecode(title)):
                Logger.log(geturl, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
                continue
            urllist.append(geturl)
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
Пример #5
0
 def step2(self, params):
     """Collect thread links from a search page and store the matches.

     Every ``.sresult > ul > li`` item yields a relative link
     (``.stitle > a``), a title (``.stitle``) and a text that is handed to
     ``TimeUtility.getuniformtime`` (``.scontent``). Links are reported and
     stored with the ``http://bbs.tgbus.com/`` prefix. Items that fail
     ``TimeUtility.compareNow`` against ``self.querylastdays`` or
     ``Common.checktitle`` are logged with the corresponding warning code;
     the rest are stored, if there are any.

     :param params: response object; ``params.content`` holds the page HTML
         and ``params.customized['key']`` is run through ``Common.urldec``
         to obtain the query used for title matching.
     """
     prefix = 'http://bbs.tgbus.com/'
     query = Common.urldec(params.customized['key'])
     page = BeautifulSoup(params.content, 'html5lib')
     accepted = []
     for entry in page.select('.sresult > ul > li'):
         href = entry.select_one('.stitle > a').get('href')
         title = entry.select_one('.stitle').get_text()
         stamp = entry.select_one('.scontent').get_text()

         recent = TimeUtility.compareNow(
             TimeUtility.getuniformtime(stamp), self.querylastdays)
         if not recent:
             Logger.log(prefix + href,
                        constant.ERRORCODE_WARNNING_NOMATCHTIME)
             continue

         if not Common.checktitle(query, title):
             Logger.log(prefix + href,
                        constant.ERRORCODE_WARNNING_NOMATCHTITLE)
             continue

         accepted.append(prefix + href)

     if accepted:
         self.__storeurllist__(accepted, SPIDER_S2_WEBSITE_TIEBA)
def compareNow(curtime, days):
    """Forward ``curtime`` and ``days`` to ``TimeUtility.compareNow``.

    The original note describes the check as "how many days the time lies
    before now"; the exact semantics are those of
    ``TimeUtility.compareNow``, whose result is returned unchanged.
    """
    verdict = TimeUtility.compareNow(curtime, days)
    return verdict
Пример #7
0
 def compareNow(self, curtime, days=None):
     """Normalise ``curtime`` and check it with ``TimeUtility.compareNow``.

     :param curtime: time value handed to ``TimeUtility.getuniformtime``.
     :param days: day window for the comparison; when omitted,
         ``self.querylastdays`` is used.
     :return: whatever ``TimeUtility.compareNow`` returns.
     """
     # Any falsy ``days`` (None as well as 0) selects the instance default.
     window = days if days else self.querylastdays
     uniform = TimeUtility.getuniformtime(curtime)
     return TimeUtility.compareNow(uniform, window)