def _parseEntity(self, result):
    """Build an Entity from a single search-result mapping.

    Only the keys present in ``result`` are copied over; every field is
    optional except that the title is read back at the end (see note below).

    Args:
        result: mapping that may contain 'titleNoFormatting',
            'addressLines' (a sequence of strings), 'lat', 'lng',
            'region' and 'phoneNumbers' (a sequence of mappings with a
            'number' key).

    Returns:
        The populated Entity, with subcategory 'other' and an empty
        ``googleLocal`` dict.
    """
    entity = Entity()
    entity.subcategory = 'other'

    if 'titleNoFormatting' in result:
        entity.title = result['titleNoFormatting']

    if 'addressLines' in result:
        # str.join is the portable spelling of string.joinfields, which no
        # longer exists in Python 3.
        entity.address = ', '.join(result['addressLines'])
        entity.subtitle = entity.address

    if 'lat' in result and 'lng' in result:
        entity.lat = float(result['lat'])
        entity.lng = float(result['lng'])

    if 'region' in result:
        entity.vicinity = result['region']

    if 'phoneNumbers' in result:
        phoneNumbers = result['phoneNumbers']

        # Only the first listed number is kept.
        if phoneNumbers:
            entity.phone = phoneNumbers[0]['number']

    entity.googleLocal = {}

    # NOTE(review): this reads entity.title unconditionally; when
    # 'titleNoFormatting' is missing the outcome depends on what Entity
    # returns for an unset title -- confirm against the Entity definition.
    entity.titlel = entity.title.lower()

    return entity
def _parseResultsPage(self, pool, url, offset=0, base=False):
    """Crawl one results page and emit every business not seen before.

    Follow-up pages are scheduled on ``pool`` before and after the page is
    downloaded; each parsed business is de-duplicated on (title, address)
    through ``self.seen`` and pushed to ``self._output``.

    Args:
        pool: object whose ``spawn(callable, *args)`` schedules further
            calls of this method.
        url: URL of the page to download with ``utils.getSoup``.
        offset: number of speculative "next page" hops taken so far. While
            it is below 8 the next page URL is computed and spawned up
            front; once it reaches 8 the page's own 'pager_page_next' link
            is followed instead, with the offset reset to 0.
        base: when true, additionally spawn a crawl for every link whose
            id matches ``self.category_re``.

    Returns:
        None. A download failure is logged and ends the call early.
    """
    utils.log('[%s] parsing page %s' % (self, url))
    max_offset = 8

    if offset < max_offset:
        # optimistically process the next results page before processing this one
        if 'start=' in url:
            start = self.start_re.match(url).groups()[0]
            nexti = int(start) + self.results_per_page
            url2 = url.replace('start=%s' % start, 'start=%d' % nexti)
        else:
            url2 = "%s&start=%d" % (url, self.results_per_page)

        pool.spawn(self._parseResultsPage, pool, url2, offset + 1)

    try:
        soup = utils.getSoup(url)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed by the download error handler.
        utils.printException()
        utils.log("[%s] error downloading page %s" % (self, url))
        return

    if offset >= max_offset:
        # Speculative prefetching has stopped; continue only if the page
        # itself advertises a next page.
        next_pagel = soup.find('a', {'id' : 'pager_page_next'})

        if next_pagel is not None:
            href = self.base + next_pagel.get('href')
            pool.spawn(self._parseResultsPage, pool, href, 0)
            time.sleep(0.01)

    if base:
        categories = soup.findAll('a', {'id' : self.category_re})

        if categories is not None:
            for category in categories:
                href = self.base + category.get('href')
                pool.spawn(self._parseResultsPage, pool, href, 0)

            # yield so other threads have a chance to start working
            time.sleep(0.01)

    separator = '___'
    results = soup.findAll('div', {'class' : re.compile('businessresult')})

    if results is None:
        return

    for result in results:
        entity = self._parseResult(result, separator)

        if entity is None:
            # One malformed result must not abort the rest of the page.
            utils.log("[%s] skipping unparseable result on page %s" % (self, url))
            continue

        key = (entity.title, entity.address)

        if key not in self.seen:
            self.seen.add(key)
            self._output.put(entity)

def _parseResult(self, result, separator='___'):
    """Convert one business-result element into an Entity, or return None.

    None is returned when the element lacks a link with an href, lacks an
    address element, or when the title or address text does not match
    ``self.title_re`` / ``self.address_re``. Rating and review count are
    optional and are only set when their text matches
    ``self.rating_reviews_re``.

    Args:
        result: parsed element for a single business result.
        separator: string inserted between the text fragments of the
            address element before it is matched against
            ``self.address_re``.
    """
    titlel = result.find('a')
    addrl = result.find('address')

    if titlel is None or addrl is None:
        return None

    href = titlel.get('href')
    title_match = self.title_re.match(titlel.getText())
    addr_match = self.address_re.match(addrl.getText(separator))

    if href is None or title_match is None or addr_match is None:
        return None

    entity = Entity()
    entity.subcategory = 'restaurant'
    entity.sources.yelp = { }

    entity.title = title_match.groups()[0]
    entity.yurl = self.base + href

    match = addr_match.groups()
    entity.address = "%s, %s" % (match[0], match[1])
    entity.phone = match[2]

    # The first <img> in the result is treated as the rating image; its
    # title attribute carries the rating text.
    rating = result.find('img')
    if rating is not None:
        rating_title = rating.get('title')
        if rating_title is not None:
            rating_match = self.rating_reviews_re.match(rating_title)
            if rating_match is not None:
                entity.yrating = float(rating_match.groups()[0])

    reviews = result.find('span', {'class' : 'reviews'})
    if reviews is not None:
        reviews_match = self.rating_reviews_re.match(reviews.getText())
        if reviews_match is not None:
            entity.yreviews = int(reviews_match.groups()[0])

    return entity