def _parseResultsPage(self, pool, queue, url, name, base=False):
    """Download and parse one page of paginated crawl results.

    Parameters:
        pool  -- worker pool (unused here; kept for interface compatibility)
        queue -- work queue; follow-up page URLs are enqueued as (href, name)
        url   -- URL of the results page to fetch
        name  -- human-readable label used only for logging
        base  -- True for the first page of a result set; triggers discovery
                 and enqueueing of the remaining pages via the pagination nav

    Each 'CWListing' section is converted to an Entity, deduplicated on
    (title, author) via self.seen, and emitted on self._output.
    """
    utils.log('[%s] parsing page %s (%s)' % (self, name, url))

    try:
        html = utils.getFile(url)
        # older BeautifulSoup versions mis-parse HTML5 <header> tags;
        # rewrite them to plain <div>s before parsing
        html = html.replace("header>", "div>")
        soup = BeautifulSoup(html)
    except Exception:
        # was a bare except: narrowed so KeyboardInterrupt/SystemExit still
        # propagate; download/parse failures remain best-effort (log + skip)
        #utils.printException()
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return

    # extract and parse the rest of the paginated results
    if base:
        try:
            page = soup.find('nav').find('span').getText()
            num_pages = int(self.page_re.match(page).groups()[0])
        except AttributeError:
            # pagination widget missing or in an unexpected format
            # (find() or match() returned None); treat as a single page
            num_pages = 1

        for i in xrange(2, num_pages + 1):
            href = '%s&pg=%d' % (url, i)
            queue.put_nowait((href, name))

    results = soup.findAll('section', {'class': 'CWListing'})

    for result in results:
        title_node = result.find('h4')
        author_node = result.find('p', {'class': 'creators'})

        # a malformed listing should skip itself, not crash the whole page
        if title_node is None or title_node.find('a') is None or author_node is None:
            continue

        entity = Entity()
        entity.subcategory = "book"
        entity.awardAnnals = {}
        entity.title = title_node.find('a').getText().strip()
        entity.author = author_node.getText()

        # dedupe on (title, author) across the whole crawl
        key = (entity.title, entity.author)
        if key in self.seen:
            continue
        self.seen.add(key)

        self._output.put(entity)