Example #1
# Scrapes a paginated results listing: downloads a page, enqueues the
# remaining pages when parsing the base page, and emits one Entity per
# book listing found.
# Requires the project's utils module, Entity class, and BeautifulSoup;
# written for Python 2 (uses xrange and the Queue/gevent-era idioms).
def _parseResultsPage(self, pool, queue, url, name, base=False):
    utils.log('[%s] parsing page %s (%s)' % (self, name, url))

    try:
        html = utils.getFile(url)
        # rewrite HTML5 <header>/</header> tags as <div>/</div> so the
        # parser doesn't choke on them
        html = html.replace("header>", "div>")
        soup = BeautifulSoup(html)
    except Exception:
        #utils.printException()
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return

    # on the base page, extract the total page count and enqueue the
    # rest of the paginated result pages
    if base:
        page = soup.find('nav').find('span').getText()
        num_pages = int(self.page_re.match(page).groups()[0])

        for i in xrange(2, num_pages + 1):
            href = '%s&pg=%d' % (url, i)
            queue.put_nowait((href, name))

    # each book listing is a <section class="CWListing">
    results = soup.findAll('section', {'class': 'CWListing'})

    for result in results:
        entity = Entity()
        entity.subcategory = "book"
        entity.awardAnnals = {}

        entity.title  = result.find('h4').find('a').getText().strip()
        entity.author = result.find('p', {'class': 'creators'}).getText()

        # dedupe on (title, author) so each book is emitted only once
        key = (entity.title, entity.author)
        if key in self.seen:
            continue

        self.seen.add(key)
        self._output.put(entity)
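
For context, here is a minimal, hypothetical driver for this method; it is not part of the original source. It seeds the base page, then drains the queue of paginated pages that the base parse enqueued. The crawl function name, the scraper object, and passing None for the unused pool argument are all assumptions for illustration.

from Queue import Queue, Empty

def crawl(scraper, base_url, name):
    # hypothetical helper: parse the base page first so it can enqueue
    # the remaining paginated pages
    queue = Queue()
    scraper._parseResultsPage(None, queue, base_url, name, base=True)

    # drain the queue; _parseResultsPage never touches pool, so None
    # is passed in its place
    while True:
        try:
            href, page_name = queue.get_nowait()
        except Empty:
            break
        scraper._parseResultsPage(None, queue, href, page_name)

In the original code the queue is presumably consumed concurrently by worker greenlets or threads; this sequential loop just makes the producer/consumer flow explicit.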