Пример #1
0
 def _parse_dump(self, filepath):
     """Stream a gzipped XML product dump and enqueue one Entity per book.

     The file at `filepath` is opened with gzip and parsed incrementally
     with etree.iterparse.  For every finished <product> element carrying a
     `product_id` attribute:

     * the first `Globals.options.offset` products are skipped;
     * parsing stops once `Globals.options.limit` entities have been
       emitted (a falsy limit means no limit);
     * products with a missing/empty ISBN, or whose serialized
       <description> does not contain 'nglish', are skipped;
     * everything else is turned into an Entity and put on self._output.

     Any exception raised while handling a single product is reported via
     utils.printException() and that product is dropped; parsing continues.
     """
     def release(node):
         # Free what the incremental parser has built so far: detach every
         # sibling that precedes `node`, then empty `node` itself.
         parent = node.getparent()
         while True:
             prev = node.getprevious()
             if prev is None:
                 break
             parent.remove(prev)
         node.clear()

     f = gzip.open(filepath, 'rb')
     try:
         context = iter(etree.iterparse(f, events=("start", "end")))

         # the first event delivered is the start of the document root
         event, root = next(context)
         offset = 0
         count  = 0

         # loop through XML and parse each product element as a book Entity
         for event, elem in context:
             if event != "end" or elem.tag != "product" or elem.get('product_id') is None:
                 continue

             root.clear()

             if offset < Globals.options.offset:
                 offset += 1
                 # skipped products must be released too, otherwise a large
                 # offset keeps every skipped element alive
                 release(elem)
                 continue

             if Globals.options.limit and count >= Globals.options.limit:
                 break

             try:
                 try:
                     entity = Entity()
                     entity.subcategory  = "book"

                     entity.title        = elem.get('name')
                     entity.bid          = int(elem.get('product_id'))
                     entity.sku_number   = elem.get('sku_number')
                     # a missing child makes find() return None, so .text
                     # raises and the product is reported and dropped below
                     entity.image        = elem.find('.//productImage').text

                     entity.author       = elem.find('.//Author').text
                     entity.publisher    = elem.find('.//Publisher').text
                     entity.publish_date = elem.find('.//Publish_Date').text
                     isbn = elem.find('.//ISBN').text

                     if isbn is None or len(isbn) <= 0:
                         continue

                     entity.isbn         = isbn

                     # matches both "English" and "english" in the description
                     # NOTE(review): etree.tostring returns bytes on Python 3,
                     # so this str-in-result test assumes Python 2 - confirm
                     # before porting.
                     desc = elem.find('description')
                     is_english = 'nglish' in etree.tostring(desc)

                     if not is_english:
                         continue

                     self._output.put(entity)
                     count += 1

                     # give the downstream consumer threads an occasional chance to work
                     if 0 == (count % 512):
                         time.sleep(0.1)
                 finally:
                     # runs on success, on both `continue` paths and on error,
                     # so no handled product is left un-released
                     release(elem)
             except Exception:
                 utils.printException()
     finally:
         # always release the gzip handle, even on `break` or a parse error
         f.close()