Пример #1
0
 def _parseListPage(self, pool, queue, url, name, base=False):
     """
         Downloads a single list page and emits one book Entity per result.
         
         Every <td class="summary"> cell on the page is treated as one book. 
         The title comes from the cell's <span class="bookName">; author, 
         publisher and description are taken from the three groups of 
         self.details_re when it matches the cell's text. Entities whose 
         (title, author) pair is already in self.seen are dropped; the rest 
         are recorded in self.seen and pushed onto self._output.
         
         url  -- address of the list page to download
         name -- human-readable name of the list (used for logging only)
         
         pool, queue and base are not used by this method.
         
         Returns None. Download failures are logged and swallowed.
     """
     utils.log('[%s] parsing list page %s (%s)' % (self, name, url))
     
     try:
         soup = utils.getSoup(url)
     except Exception:
         # best-effort crawl: log the failure and move on, but let 
         # KeyboardInterrupt / SystemExit propagate
         utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
         return
     
     for result in soup.findAll('td', {'class' : 'summary'}):
         title_elem = result.find('span', {'class' : 'bookName'})
         if title_elem is None:
             # no title to extract; skip this cell instead of aborting 
             # the remainder of the page
             continue
         
         title = title_elem.getText().strip().title()
         if title.endswith(','):
             # drop the trailing comma that separates title from author
             title = title[:-1]
         
         entity = Entity()
         entity.subcategory = "book"
         entity.nytimes = {}
         entity.title = title
         
         # flatten the cell's text with an explicit separator between text 
         # nodes so that details_re can tell the individual fields apart
         details = result.getText(separator='___')
         details_match = self.details_re.match(details)
         
         if details_match:
             author, publisher, desc = details_match.groups()[:3]
             entity.author    = author
             entity.publisher = publisher
             entity.desc      = desc
         
         # NOTE(review): entity.author is read here even when details_re did 
         # not match; this relies on Entity tolerating reads of unset 
         # attributes -- confirm.
         key = (entity.title, entity.author)
         if key in self.seen:
             continue
         
         self.seen.add(key)
         self._output.put(entity)
Пример #2
0
 def _parseResultsPage(self, pool, queue, url, name, base=False):
     """
         Downloads one page of results and emits one book Entity per listing.
         
         When base is True, the page is additionally inspected for the total 
         number of pages (read from the first <span> inside <nav> via 
         self.page_re), and a (href, name) tuple is queued for each of pages 
         2..N, where href is url with '&pg=<page number>' appended.
         
         Every <section class="CWListing"> is treated as one book; its title 
         is the text of the link inside its <h4> and its author is the text 
         of its <p class="creators">. Entities whose (title, author) pair is 
         already in self.seen are dropped; the rest are recorded in self.seen 
         and pushed onto self._output.
         
         queue -- queue receiving the follow-up (href, name) page tuples
         url   -- address of the results page to download
         name  -- human-readable name of the result set (used for logging 
                  and passed along with queued pages)
         base  -- True if this is the first page of a paginated result set
         
         pool is not used by this method.
         
         Returns None. Download / parse failures are logged and swallowed.
     """
     utils.log('[%s] parsing page %s (%s)' % (self, name, url))
     
     try:
         html = utils.getFile(url)
         # rewrite <header> / </header> tags as divs before parsing 
         # NOTE(review): presumably a workaround for the parser mishandling 
         # <header> elements -- confirm.
         html = html.replace("header>", "div>") 
         soup = BeautifulSoup(html)
     except Exception:
         # best-effort crawl: log the failure and move on, but let 
         # KeyboardInterrupt / SystemExit propagate
         utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
         return
     
     # extract and queue the rest of the paginated results
     if base:
         nav = soup.find('nav')
         span = nav.find('span') if nav is not None else None
         page_match = self.page_re.match(span.getText()) if span is not None else None
         
         if page_match is None:
             # without a page count we can't enqueue follow-up pages, but 
             # the listings on this page are still worth parsing
             utils.log("[%s] unable to determine page count for %s (%s)" % (self, name, url))
         else:
             num_pages = int(page_match.groups()[0])
             
             for i in range(2, num_pages + 1):
                 href = '%s&pg=%d' % (url, i)
                 
                 queue.put_nowait((href, name))
     
     for result in soup.findAll('section', {'class' : 'CWListing'}):
         heading = result.find('h4')
         link    = heading.find('a') if heading is not None else None
         creators = result.find('p', {'class' : 'creators'})
         
         if link is None or creators is None:
             # malformed listing; skip it instead of aborting the 
             # remainder of the page
             continue
         
         entity = Entity()
         entity.subcategory = "book"
         entity.awardAnnals = {}
         
         entity.title  = link.getText().strip()
         entity.author = creators.getText()
         
         key = (entity.title, entity.author)
         if key in self.seen:
             continue
         
         self.seen.add(key)
         self._output.put(entity)
Пример #3
0
 def _parse_dump(self, filepath):
     """
         Streams a gzipped XML dump and emits one book Entity per product.
         
         The file is parsed incrementally with etree.iterparse. Each closed 
         <product> element carrying a product_id attribute is converted into 
         an Entity and pushed onto self._output, subject to:
         
           * the first Globals.options.offset products are skipped
           * parsing stops once Globals.options.limit entities have been 
             emitted (a falsy limit means no limit)
           * products without an ISBN are skipped
           * products whose serialized <description> element does not 
             contain the substring 'nglish' are skipped
         
         Any exception raised while converting a single product is printed 
         via utils.printException() and parsing continues with the next one.
         
         filepath -- path of the gzip-compressed XML dump
         
         Returns None.
     """
     f = gzip.open(filepath, 'rb')
     
     try:
         context = iter(etree.iterparse(f, events=("start", "end")))
         
         # the first event yields the element kept as root for the 
         # root.clear() calls below
         event, root = next(context)
         offset = 0
         count  = 0
         
         # loop through XML and parse each product element as a book Entity
         for event, elem in context:
             if event != "end" or elem.tag != "product" or elem.get('product_id') is None:
                 continue
             
             # NOTE(review): presumably clears the root here to keep memory 
             # bounded while streaming a large dump -- confirm.
             root.clear()
             
             if offset < Globals.options.offset:
                 offset += 1
                 continue
             
             if Globals.options.limit and count >= Globals.options.limit:
                 break
             
             try:
                 entity = Entity()
                 entity.subcategory  = "book"
                 
                 entity.title        = elem.get('name')
                 entity.bid          = int(elem.get('product_id'))
                 entity.sku_number   = elem.get('sku_number')
                 entity.image        = elem.find('.//productImage').text
                 
                 entity.author       = elem.find('.//Author').text
                 entity.publisher    = elem.find('.//Publisher').text
                 entity.publish_date = elem.find('.//Publish_Date').text
                 isbn = elem.find('.//ISBN').text
                 
                 if not isbn:
                     continue
                 
                 entity.isbn         = isbn
                 
                 # 'nglish' matches both 'English' and 'english' anywhere 
                 # in the serialized description element
                 desc = elem.find('description')
                 is_english = 'nglish' in etree.tostring(desc)
                 
                 if not is_english:
                     continue
                 
                 self._output.put(entity)
                 count += 1
                 
                 # give the downstream consumer threads an occasional chance to work
                 if 0 == (count % 512):
                     time.sleep(0.1)
                 
                 # drop every sibling preceding this product from its 
                 # parent, then empty the product element itself
                 parent = elem.getparent()
                 while True:
                     prev = elem.getprevious()
                     if prev is None:
                         break
                     parent.remove(prev)
                 
                 elem.clear()
             except Exception:
                 # a single malformed product must not abort the whole dump
                 utils.printException()
     finally:
         # release the file handle on every exit path (limit reached, 
         # end of file, or a parse error propagating out of iterparse)
         f.close()