def crawl(self, url):
    self.output = open('hao123.crawl%s' % datetime.now().date(), 'w')
    _logger.info('opening hao123 home page: %s' % url)
    html = self.br.open(url).read()
    soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"))
    # Top-tier category tables are marked with the custom attribute monkey="cool"
    for top_tier in soup.findAll('table', monkey='cool'):
        anchor = top_tier.find('a')
        _logger.info('crawling top tier category: %s (%s)' % (anchor.text, anchor['href']))
        self.crawl_first(pbrowser.abs_url(url, anchor['href']))
    self.output.close()
def crawl_second(self, url):
    self._randsleep()
    _logger.debug('opening url: %s' % url)
    html = self.br.open(url).read()
    soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"))
    for anchor in soup.findAll('a'):
        try:
            href = anchor['href']
            # Skip relative links and links back to hao123 itself
            if href[:4] != "http" or href.find('hao123.com') != -1:
                continue
            self.output.write(' %s %s\n' % (href.encode('utf8'), anchor.text.encode('utf8')))
        except Exception, err:
            _logger.error('got error with anchor(%s): %s' % (str(anchor), err))
def crawl(self, url):
    self.owned = set()
    self.output = open('265.crawl%s' % datetime.now().date(), 'w')
    _logger.info('opening 265 home page: %s' % url)
    html = self.br.open(url).read()
    soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"), fromEncoding='utf-8')
    for anchor in soup.find('div', id="siteCate").find('div', 'body').findAll('a'):
        _logger.info('crawling top tier category: %s (%s)' % (anchor.text, anchor['href']))
        self.output.write('%s\n' % anchor.text.encode('utf8'))
        self.crawl_layer(pbrowser.abs_url(url, anchor['href']), 1)
    self.output.close()
def _crawl_primary(self, anchor):
    self.output.write(anchor.text.encode('utf-8') + '\n')
    self._randsleep()
    html = self.br.open(anchor['href']).read()
    html = util.convert_to_utf8(html, 'gb2312')
    soup = BeautifulSoup(html)
    # Secondary category headings carry the "dirtit" class
    seconds = soup.findAll('div', 'dirtit')
    for second in seconds:
        _logger.info('crawling secondary category: (%s)' % second.text.encode('utf-8'))
        try:
            self._crawl_secondary(second)
        except Exception, err:
            _logger.error('secondary(%s) failed: %s' % (second.text.encode('utf-8'), err))
def crawl_first(self, url):
    self._randsleep()
    _logger.info('opening first tier url: %s' % url)
    html = self.br.open(url).read()
    soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"))
    _logger.info('processing page with title: %s' % soup.title.text)
    tds = soup.findAll('td', 'tdH')
    for td in tds:
        anchor = td.find('a')
        if anchor is None:
            continue
        _logger.info('crawling second tier category: %s (%s)' % (anchor.text, anchor['href']))
        self.output.write('%s\n' % anchor.text.encode('utf-8'))
        self.crawl_second(pbrowser.abs_url(url, anchor['href']))
def crawl(self, url):
    self.output = open('baike.crawl%s' % datetime.now().date(), 'w')
    _logger.info('opening baike home page: %s' % url)
    html = self.br.open(url).read()
    html = util.convert_to_utf8(html, 'gb2312')
    soup = BeautifulSoup(html)
    for item in soup.find('div', id="classList").findAll('h2'):
        anchor = item.find('a')
        _logger.info('crawling primary category: (%s), %s'
                     % (anchor.text.encode('utf-8'), anchor['href'].encode('utf-8')))
        try:
            self._crawl_primary(anchor)
        except Exception, err:
            _logger.error('primary category(%s) failed: %s' % (anchor.text.encode('utf-8'), err))
def _crawl_fourth(self, url):
    page = 1
    while True:
        _logger.debug('fourth layer page %d (%s)' % (page, url))
        page += 1
        self._randsleep()
        html = self.br.open(url).read()
        html = util.convert_to_utf8(html, 'gb2312')
        soup = BeautifulSoup(html)
        for td in soup.findAll('td', 'f'):
            self.output.write(' %s\n' % td.find('a').text.encode('utf-8'))
        self.output.flush()
        # Follow the pagination link whose text is u"下一页" ("next page");
        # the link's href lives on the text node's parent <a> tag.
        # Stop when no such link is present.
        try:
            url = soup.find('font', 'f9').find(text=u"下一页").parent['href']
        except:
            break
def crawl_layer(self, url, level):
    self._randsleep()
    prefix = ' ' * level
    _logger.info('opening layer url: %s' % url)
    html = self.br.open(url).read()
    soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"), fromEncoding="utf-8")
    _logger.info('processing page with title: %s' % soup.title.text)

    # get next level links
    children = {}
    for li in soup.find('div', id='TreeData').findAll('li', 'close'):
        a = li.find('a')
        children[a.text] = a['href']

    # grab links in current page
    for div in soup.find('div', id="BMain").findAll('div', 'subBM'):
        cate = div.find('h3').text
        if cate in self.owned:
            continue
        self.owned.add(cate)
        self.output.write(prefix + '%s\n' % cate.encode('utf8'))
        for li in div.find('ul', 'listUrl').findAll('li'):
            try:
                a = li.find('a')
                self.output.write(prefix * 2 + '%s %s\n'
                                  % (a['href'].encode('utf8'), a.text.encode('utf8')))
            except Exception, err:
                _logger.error('error processing anchor(%s): %s' % (str(li), err))

        # grab links in next level, if any
        if cate in children:
            self.crawl_layer(children[cate], level + 1)
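# _randsleep() is called before every request above but is not defined in this
# section. A minimal sketch of the idea, assuming it only needs to pause for a
# short random interval so the crawled sites are not hammered; the 1-5 second
# bounds are an illustrative assumption, not values from the original code.
def _randsleep(self):
    import random
    import time
    time.sleep(random.uniform(1, 5))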