Пример #1
0
 def _parseResultsPage(self, pool, url, offset=0, base=False):
     """Parse one results page, queue its entities and spawn follow-up pages.
     
     Args:
         pool: worker pool; only its ``spawn(func, *args)`` method is used.
         url: address of the results page to download and parse.
         offset: how many pages have been chained so far by guessing the
             next ``start=`` value. While it is below 8 the following page
             is spawned before this one is downloaded; once it reaches 8
             the page's own ``pager_page_next`` link is followed instead
             and the count restarts at 0.
         base: when true, a crawl is also spawned for every anchor whose
             id matches ``self.category_re``.
     
     Returns:
         None. Every parsed entity whose (title, address) pair is not yet
         in ``self.seen`` is added there and put on ``self._output``.
         Download failures and malformed individual results are logged
         and skipped rather than raised.
     """
     utils.log('[%s] parsing page %s' % (self, url))
     max_offset = 8
     
     if offset < max_offset:
         # optimistically process the next results page before processing this one
         url2 = None
         
         if 'start=' in url:
             start_match = self.start_re.match(url)
             
             if start_match is not None:
                 start = start_match.groups()[0]
                 nexti = int(start) + self.results_per_page
                 url2  = url.replace('start=%s' % start, 'start=%d' % nexti)
             else:
                 # the url mentions start= but not in the expected shape; give up
                 # on guessing the next page, but still parse this one
                 utils.log("[%s] unable to determine next page for %s" % (self, url))
         else:
             url2 = "%s&start=%d" % (url, self.results_per_page)
         
         if url2 is not None:
             pool.spawn(self._parseResultsPage, pool, url2, offset + 1)
     
     try:
         soup = utils.getSoup(url)
     except Exception:
         # deliberately not a bare except: KeyboardInterrupt / SystemExit
         # must still be able to stop the crawl
         utils.printException()
         utils.log("[%s] error downloading page %s" % (self, url))
         return
     
     if offset >= max_offset:
         # stop guessing urls and follow the pager link found on the page itself
         next_pagel = soup.find('a', {'id' : 'pager_page_next'})
         
         if next_pagel is not None:
             href = self.base + next_pagel.get('href')
             pool.spawn(self._parseResultsPage, pool, href, 0)
             time.sleep(0.01)
     
     if base:
         categories = soup.findAll('a', {'id' : self.category_re})
         
         if categories is not None:
             for category in categories:
                 href = self.base + category.get('href')
                 pool.spawn(self._parseResultsPage, pool, href, 0)
             
             # yield so other threads have a chance to start working
             time.sleep(0.01)
     
     separator = '___'
     results   = soup.findAll('div', {'class' : re.compile('businessresult')})
     
     if not results:
         return
     
     for result in results:
         entity = Entity()
         entity.subcategory = 'restaurant'
         entity.sources.yelp = { }
         
         try:
             titlel = result.find('a')
             title  = titlel.getText()
             entity.title = self.title_re.match(title).groups()[0]
             entity.yurl  = self.base + titlel.get('href')
             
             addr   = result.find('address').getText(separator)
             match  = self.address_re.match(addr).groups()
             
             entity.address = "%s, %s" % (match[0], match[1])
             entity.phone = match[2]
             
             rating = result.find('img')
             if rating is not None:
                 entity.yrating = float(self.rating_reviews_re.match(rating.get('title')).groups()[0])
             
             reviews = result.find('span', {'class' : 'reviews'})
             if reviews is not None:
                 entity.yreviews = int(self.rating_reviews_re.match(reviews.getText()).groups()[0])
         except (AttributeError, TypeError, ValueError, IndexError):
             # a missing tag or a non-matching regex yields None somewhere in
             # the chain above; drop this one result instead of losing the
             # rest of the page
             utils.printException()
             utils.log("[%s] error parsing result on page %s" % (self, url))
             continue
         
         key = (entity.title, entity.address)
         if key not in self.seen:
             self.seen.add(key)
             self._output.put(entity)