Example #1
    def crawl(self, url):
        self.output = open('hao123.crawl%s' % datetime.now().date(), 'w')
        _logger.info('opening hao123 home page: %s' % url)
        html = self.br.open(url).read()
        soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"))

        for top_tier in soup.findAll('table', monkey='cool'):
            anchor = top_tier.find('a')
            _logger.info('crawling top tier category: %s (%s)'
                         % (anchor.text, anchor['href']))
            self.crawl_first(pbrowser.abs_url(url, anchor['href']))
        self.output.close()
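
All of these snippets rely on a util.convert_to_utf8 helper that is not shown. A minimal sketch of what it presumably does, assuming it decodes the raw response (falling back to a legacy encoding such as gb2312) and re-encodes the text as UTF-8:

    def convert_to_utf8(raw, fallback_encoding):
        # Try UTF-8 first; if that fails, decode with the fallback encoding
        # (gb2312 for these Chinese directory sites) and re-encode as UTF-8.
        try:
            text = raw.decode('utf-8')
        except UnicodeDecodeError:
            text = raw.decode(fallback_encoding, 'ignore')
        return text.encode('utf-8')
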
Example #2
    def crawl_second(self, url):
        self._randsleep()
        _logger.debug('opening url: %s' % url)
        html = self.br.open(url).read()
        soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"))

        for anchor in soup.findAll('a'):
            try:
                href = anchor['href']
                # Skip non-http links and links internal to hao123.com
                if href[:4] != "http" or href.find('hao123.com') != -1:
                    continue
                self.output.write('  %s %s\n' % (href.encode('utf8'), anchor.text.encode('utf8')))
            except Exception, err:
                _logger.error('got error with anchor(%s): %s' % (str(anchor), err))
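
The _randsleep call made before each request is not shown either. Assuming the standard random and time modules are imported at module level, a minimal sketch of such a method (the 1-5 second range is a guess) could be:

    def _randsleep(self):
        # Sleep for a random short interval so consecutive requests are
        # spread out in time instead of hitting the site back to back.
        time.sleep(random.uniform(1, 5))
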
Example #3
    def crawl(self, url):
        self.owned = set()
        self.output = open('265.crawl%s' % datetime.now().date(), 'w')
        _logger.info('opening 265 home page: %s' % url)
        html = self.br.open(url).read()
        soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"),
                             fromEncoding='utf-8')

        cate_body = soup.find('div', id="siteCate").find('div', 'body')
        for anchor in cate_body.findAll('a'):
            _logger.info('crawling top tier category: %s (%s)' %
                         (anchor.text, anchor['href']))
            self.output.write('%s\n' % anchor.text.encode('utf8'))
            self.crawl_layer(pbrowser.abs_url(url, anchor['href']), 1)
        self.output.close()
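
pbrowser.abs_url is used throughout to turn the relative hrefs scraped from a page into absolute URLs before they are fetched. A minimal sketch, assuming it is a thin wrapper around the standard library's urljoin:

    from urlparse import urljoin

    def abs_url(base_url, href):
        # Resolve a possibly-relative href against the URL of the page
        # it was scraped from.
        return urljoin(base_url, href)
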
Example #4
    def _crawl_primary(self, anchor):
        self.output.write(anchor.text.encode('utf-8') + '\n')
        self._randsleep()
        html = self.br.open(anchor['href']).read()
        html = util.convert_to_utf8(html, 'gb2312')
        soup = BeautifulSoup(html)

        seconds = soup.findAll('div', 'dirtit')
        for second in seconds:
            _logger.info('crawling secondary category: (%s)' %
                         second.text.encode('utf-8'))
            try:
                self._crawl_secondary(second)
            except Exception, err:
                _logger.error('secondary(%s) failed: %s' %
                              (second.text.encode('utf-8'), err))
Example #5
    def crawl_first(self, url):
        self._randsleep()
        _logger.info('opening first tier url: %s' % url)
        html = self.br.open(url).read()
        soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"))
        _logger.info('processing page with title:%s' % soup.title.text)

        tds = soup.findAll('td', 'tdH')
        for td in tds:
            anchor = td.find('a')
            if anchor is None:
                continue
            _logger.info('crawling second tier category: %s (%s)'
                         % (anchor.text, anchor['href']))
            self.output.write('%s\n' % anchor.text.encode('utf-8'))
            self.crawl_second(pbrowser.abs_url(url, anchor['href']))
Example #6
    def crawl(self, url):
        self.output = open('baike.crawl%s' % datetime.now().date(), 'w')
        _logger.info('opening baike home page: %s' % url)
        html = self.br.open(url).read()
        html = util.convert_to_utf8(html, 'gb2312')
        soup = BeautifulSoup(html)

        for item in soup.find('div', id="classList").findAll('h2'):
            anchor = item.find('a')
            _logger.info(
                'crawling primary category: (%s), %s' %
                (anchor.text.encode('utf-8'), anchor['href'].encode('utf-8')))
            try:
                self._crawl_primary(anchor)
            except Exception, err:
                _logger.error('primary category(%s) failed: %s' %
                              (anchor.text.encode('utf-8'), err))
Example #7
    def _crawl_fourth(self, url):
        page = 1
        while True:
            _logger.debug('fourth layer page %d (%s)' % (page, url))
            page += 1
            self._randsleep()
            html = self.br.open(url).read()
            html = util.convert_to_utf8(html, 'gb2312')
            soup = BeautifulSoup(html)

            for td in soup.findAll('td', 'f'):
                self.output.write('      %s\n' %
                                  td.find('a').text.encode('utf-8'))
                self.output.flush()
            try:
                # find(text=...) returns the matched string node; its .parent
                # is the enclosing <a> tag for the "下一页" (next page) link.
                next_anchor = soup.find('font', 'f9').find(text=u"下一页")
                url = next_anchor.parent['href']
            except Exception:
                break
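
Example #7 pages through a listing by repeatedly following the "下一页" ("next page") link until the pager no longer offers one. The same pattern in isolation, assuming BeautifulSoup 3 and the pager markup used above, might look like:

    from BeautifulSoup import BeautifulSoup

    def iter_pages(br, url):
        # Yield the parsed soup of each page, following the "next page"
        # anchor inside <font class="f9"> until there is none left.
        # Assumes the matched text node sits directly inside an <a href=...>.
        while url:
            soup = BeautifulSoup(br.open(url).read())
            yield soup
            pager = soup.find('font', 'f9')
            next_text = pager.find(text=u"下一页") if pager else None
            url = next_text.parent['href'] if next_text else None
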
Example #8
    def crawl_layer(self, url, level):
        self._randsleep()

        prefix = '  ' * level
        _logger.info('opening layer url: %s' % url)
        html = self.br.open(url).read()
        soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"),
                             fromEncoding="utf-8")
        _logger.info('processing page with title:%s' % soup.title.text)

        # get next level links
        children = {}
        for li in soup.find('div', id='TreeData').findAll('li', 'close'):
            a = li.find('a')
            children[a.text] = a['href']

        # grab links in current page
        for div in soup.find('div', id="BMain").findAll('div', 'subBM'):
            cate = div.find('h3').text
            if cate in self.owned:
                continue

            self.owned.add(cate)
            self.output.write(prefix + '%s\n' % cate.encode('utf8'))
            for li in div.find('ul', 'listUrl').findAll('li'):
                try:
                    a = li.find('a')
                    self.output.write(
                        prefix * 2 + '%s %s\n' %
                        (a['href'].encode('utf8'), a.text.encode('utf8')))
                except Exception, err:
                    _logger.error('error processing anchor(%s): %s' %
                                  (str(li), err))

            # grab links in next level, if any
            if cate in children:
                self.crawl_layer(children[cate], level + 1)