def _parseRestaurantPage(self, pool, region_name, city_name, restaurant_name, href):
    """Scrape a single restaurant page and queue the resulting entity.

    The page at ``href`` is fetched with ``utils.getSoup``. An ``Entity``
    with subcategory "restaurant" is filled in with the title, the address
    ("<street>, <geo>") and the zagat source url (``self.base + href``),
    plus the cuisine, website and preview image when the page has them,
    and is then put on ``self._output``.

    Args:
        pool: Not used by this method.
        region_name: Region label; only used in the log message.
        city_name: City label; only used in the log message.
        restaurant_name: Stored as the entity title.
        href: Location of the restaurant page; passed to ``utils.getSoup``
            and appended to ``self.base`` to form the source url.

    Returns:
        None. Nothing is queued when the download fails or when the page
        has no parsable address block; both cases are logged.
    """
    utils.log("[%s] parsing restaurant '%s.%s.%s' (%s)" %
              (self, region_name, city_name, restaurant_name, href))

    def find_path(node, *steps):
        """Follow a chain of find() calls; None as soon as one step misses."""
        for name, attrs in steps:
            if node is None:
                return None
            node = node.find(name, attrs)
        return node

    try:
        soup = utils.getSoup(href)
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt / SystemExit
        # still propagate instead of being logged as a download error.
        utils.printException()
        utils.log("[%s] error downloading page %s" % (self, href))
        return

    # parse the address for the current restaurant; without it the page is
    # skipped rather than raising AttributeError on a missing tag
    addr = find_path(soup, ('div', {'class': 'address'}))
    street = find_path(addr, ('span', {'class': 'street'}))
    geo = find_path(addr, ('span', {'class': 'geo'}))
    if street is None or geo is None:
        utils.log("[%s] error parsing address on page %s" % (self, href))
        return

    address = "%s, %s" % (street.getText().strip(), geo.getText().strip())

    # add the current restaurant to the output for this crawler
    entity = Entity()
    entity.subcategory = "restaurant"
    entity.title = restaurant_name
    entity.address = address
    entity.sources.zagat = {
        'zurl': self.base + href,
    }

    # parse cuisine (optional)
    cuisine = find_path(
        soup,
        ('div', {'id': "block-zagat_restaurants-14"}),
        ('ul', {}),
        ('li', {'class': 'first'}),
    )
    if cuisine is not None:
        entity.cuisine = cuisine.getText()

    # parse website (optional)
    link = find_path(soup, ('span', {'class': 'website'}), ('a', {}))
    if link is not None:
        entity.site = link.get('href')

    # parse preview image (optional)
    img = find_path(
        soup,
        ('div', {'id': 'content'}),
        ('div', {'class': 'photo'}),
        ('img', {}),
    )
    if img is not None:
        entity.image = img.get('src')

    self._output.put(entity)