def _parseEntity(self, result):
    entity = Entity()
    entity.subcategory = 'other'
    
    if 'titleNoFormatting' in result:
        entity.title = result['titleNoFormatting']
    
    if 'addressLines' in result:
        # note: str.join replaces the long-deprecated string.joinfields
        entity.address  = ', '.join(result['addressLines'])
        entity.subtitle = entity.address
    
    if 'lat' in result and 'lng' in result:
        entity.lat = float(result['lat'])
        entity.lng = float(result['lng'])
    
    if 'region' in result:
        entity.vicinity = result['region']
    
    if 'phoneNumbers' in result:
        phoneNumbers = result['phoneNumbers']
        
        if len(phoneNumbers) > 0:
            entity.phone = phoneNumbers[0]['number']
    
    entity.googleLocal = {}
    entity.titlel = entity.title.lower()
    
    return entity
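# For reference, the Google Local result dicts this parser consumes look
# roughly like the fragment below. The keys are exactly the ones checked
# above; the values are invented for illustration.
result = {
    'titleNoFormatting' : "Joe's Diner", 
    'addressLines'      : ['123 Main St', 'Springfield'], 
    'lat'               : '37.42', 
    'lng'               : '-122.08', 
    'region'            : 'Springfield', 
    'phoneNumbers'      : [{ 'number' : '(555) 555-0100' }], 
}

address  = ', '.join(result['addressLines'])            # '123 Main St, Springfield'
lat, lng = float(result['lat']), float(result['lng'])   # (37.42, -122.08)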
def _parseEntity(self, sheet, index, numEntities):
    # periodically log parse progress and yield to other threads
    if numEntities > 100 and ((index - 1) % (numEntities / 100)) == 0:
        utils.log("[%s] done parsing %s" % \
            (self.NAME, utils.getStatusStr(index - 1 - Globals.options.offset, numEntities)))
        time.sleep(0.1)
    
    row = sheet.row_values(index)
    
    entity = Entity()
    entity.subcategory = "restaurant"
    entity.title   = row[1]
    entity.address = row[3] + ', ' + \
                     row[4] + ', ' + \
                     row[5] + ' ' + \
                     row[6]
    
    entity.openTable = {
        'rid'              : int(row[8]), 
        'reserveURL'       : row[9], 
        'countryID'        : row[10], 
        'metroName'        : row[0], 
        'neighborhoodName' : row[2], 
    }
    
    # don't make external calls to opentable in test mode
    if not Globals.options.test:
        result = OpenTableParser.parseEntity(entity)
        if result is None:
            return
    
    self._output.put(entity)
def _parseRestaurantPage(self, pool, region_name, area_name, restaurant_name, href):
    utils.log("[%s] parsing restaurant '%s.%s.%s' (%s)" % 
              (self, region_name, area_name, restaurant_name, href))
    
    try:
        soup = utils.getSoup(href)
    except:
        utils.log("[%s] error downloading page %s" % (self, href))
        return
    
    # parse the address for the current restaurant
    addr     = soup.find('span', {'class' : 'adr'})
    street   = addr.find('span', {'class' : 'street-address'}).getText().strip()
    locality = addr.find('span', {'class' : 'locality'}).getText().strip()
    region   = addr.find('span', {'class' : 'region'}).getText().strip()
    zipcode  = addr.find('a', {'class' : re.compile('postal-code')}).getText().strip()
    
    address = "%s, %s, %s %s" % (street, locality, region, zipcode)
    
    # add the current restaurant to the output for this crawler
    entity = Entity()
    entity.subcategory = "restaurant"
    entity.title   = restaurant_name
    entity.address = address
    entity.sources.urbanspoon = {
        'uurl' : href, 
    }
    
    self._output.put(entity)
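# A minimal, self-contained sketch of the hCard microformat pattern the
# address parsing above relies on. The markup below is invented, and bs4's
# BeautifulSoup stands in for whatever utils.getSoup returns; the find /
# getText calls mirror the ones used throughout these crawlers.
from bs4 import BeautifulSoup

html = """
<span class="adr">
  <span class="street-address">123 Main St.</span>
  <span class="locality">Portland</span>
  <span class="region">OR</span>
  <a class="postal-code" href="#">97201</a>
</span>
"""

addr     = BeautifulSoup(html, 'html.parser').find('span', {'class' : 'adr'})
street   = addr.find('span', {'class' : 'street-address'}).getText().strip()
locality = addr.find('span', {'class' : 'locality'}).getText().strip()
region   = addr.find('span', {'class' : 'region'}).getText().strip()
zipcode  = addr.find('a', {'class' : 'postal-code'}).getText().strip()

print("%s, %s, %s %s" % (street, locality, region, zipcode))
# -> 123 Main St., Portland, OR 97201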
def _parseResultsPage(self, pool, href):
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return
    
    results = soup.find('div', { 'name' : 'LocationDirectory' }).findAll('h3')
    
    for result in results:
        try:
            name = result.find('a').getText().strip()
        except Exception:
            continue
        
        try:
            raw_address = result.findNext('span', { 'class' : 'address' }).getText()
            street = raw_address[0:raw_address.find('(')].strip()
            locale = raw_address[raw_address.find(')') + 1:raw_address.find('CA') + 2].strip()
            addr   = '{0}, {1}'.format(street, locale)
        except Exception:
            addr = ''
            utils.log("[%s] error parsing %s (%s)" % (self, addr, href))
            continue
        
        if addr == '':
            continue
        if name == '':
            continue
        if (name, addr) in self._seen:
            continue
        
        self._seen.add((name, addr))
        
        entity = Entity()
        entity.subcategory = "bar"
        entity.title   = name
        entity.address = addr
        entity.sources.sfweekly = { }
        
        self._output.put(entity)
    
    # try the next page
    try:
        pagination = soup.find('span', { 'class' : 'Pagination' }).getText()
        
        if 'Next' in pagination:
            pagination = soup.find('span', { 'class' : 'Pagination' })
            href_get   = pagination.find('span', { 'class' : 'PaginationSelected' }).findNext('a').get('href')
            next_page  = '{0}{1}'.format('http://www.sfweekly.com', href_get)
        else:
            next_page = ''
    except Exception:
        next_page = ''
    
    if next_page != '':
        pool.spawn(self._parseResultsPage, pool, next_page)
def _parseResultsPage(self, pool, href):
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return
    
    results = soup.find('div', { 'class' : 'split-right-column' }).findAll('div', { 'class' : 'clear' })
    
    for result in results:
        try:
            name = result.findNext('div').find('h2').find('a').getText().strip()
        except Exception:
            continue
        
        try:
            street = result.findNext('div').find('address').getText()
            locale = '{0}, {1}'.format('Los Angeles', 'CA')
            addr   = '{0}, {1}'.format(street, locale)
        except Exception:
            addr = ''
            continue
        
        if addr == '':
            continue
        if name == '':
            continue
        if (name, addr) in self._seen:
            continue
        
        self._seen.add((name, addr))
        
        entity = Entity()
        
        if 'Bars' in result.findNext('span').getText():
            entity.subcategory = "bar"
        else:
            entity.subcategory = "restaurant"
        
        entity.title   = name
        entity.address = addr
        entity.sources.timeout_la = { }
        
        self._output.put(entity)
    
    # try the next page
    try:
        href_get  = soup.find('div', { 'class' : 'next' }).find('a').get('href')
        next_page = '{0}{1}'.format('http://www.timeout.com', href_get)
    except Exception:
        next_page = ''
    
    if next_page != '':
        pool.spawn(self._parseResultsPage, pool, next_page)
def _parseResultsPage(self, pool, href):
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return
    
    results = soup.findAll('h3')
    
    for result in results:
        try:
            name = result.find('span', { 'style' : 'cursor:pointer;' }).getText().strip()
        except AttributeError:
            # log the page only; 'name' is unbound if the lookup failed
            utils.log("[%s] error parsing name (%s)" % (self, href))
            return
        
        try:
            address1 = result.findNext('span', { 'class' : 'addresslinecaps' }).getText().strip()
            
            # sf mag does not provide any city, state or zip information, 
            # so inserting basic universal info manually.
            if '(' in address1:
                addr = '{0}, {1}'.format(address1.split('(')[0].strip(), 'San Francisco, CA')
            else:
                addr = '{0}, {1}'.format(address1, 'San Francisco, CA')
        except AttributeError:
            utils.log("[%s] error parsing address for %s (%s)" % (self, name, href))
            return
        
        entity = Entity()
        entity.subcategory = "restaurant"
        entity.title   = name
        entity.address = addr
        entity.sources.sfmag = { }
        
        self._output.put(entity)
    
    # locate total pages and compare against current page num to determine 
    # if we should iterate again
    try:
        total_pages = soup.find('span', { 'class' : 'last' }).findPrevious('span').getText().strip()
    except AttributeError:
        # crawling of pages is done
        return
    
    index = href.find('&page=')
    end   = href.find('&keyword')
    page  = href[index + 6:end]
    
    if int(page) <= int(total_pages) - 1:
        next_page = href.replace('&page=' + str(page), '&page=' + str(int(page) + 1))
        pool.spawn(self._parseResultsPage, pool, next_page)
    else:
        return
    
    time.sleep(0.01)
def _parseResultsPage(self, pool, href):
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return
    
    results = soup.find('div', { 'class' : 'search_results' }).findAll('div', { 'class' : 'restaurant' })
    
    for result in results:
        try:
            name = result.find('h3').find('a').getText().strip()
        except Exception:
            continue
        
        try:
            street = result.find('br').previousSibling.strip()
            locale = '{0}, {1}'.format(result.find('br').nextSibling.strip(), 'CA')
            addr   = '{0}, {1}'.format(street, locale)
        except Exception:
            addr = ''
            utils.log("[%s] error parsing %s (%s)" % (self, addr, href))
            continue
        
        if addr == '':
            continue
        if name == '':
            continue
        if (name, addr) in self._seen:
            continue
        
        self._seen.add((name, addr))
        
        entity = Entity()
        entity.subcategory = "restaurant"
        entity.title   = name
        entity.address = addr
        entity.sources.sfgate = { }
        
        self._output.put(entity)
    
    # try the next page
    try:
        href_get  = soup.find('li', { 'class' : 'next' }).find('a').get('href')
        next_page = '{0}{1}'.format('http://www.sfgate.com', href_get)
    except Exception:
        next_page = ''
    
    if next_page != '':
        pool.spawn(self._parseResultsPage, pool, next_page)
def _parseRestaurantPage(self, pool, region_name, city_name, restaurant_name, href):
    utils.log("[%s] parsing restaurant '%s.%s.%s' (%s)" % 
              (self, region_name, city_name, restaurant_name, href))
    
    try:
        soup = utils.getSoup(href)
    except:
        utils.printException()
        utils.log("[%s] error downloading page %s" % (self, href))
        return
    
    # parse the address for the current restaurant
    addr   = soup.find('div', {'class' : 'address'})
    street = addr.find('span', {'class' : 'street'}).getText().strip()
    geo    = addr.find('span', {'class' : 'geo'}).getText().strip()
    
    address = "%s, %s" % (street, geo)
    
    # add the current restaurant to the output for this crawler
    entity = Entity()
    entity.subcategory = "restaurant"
    entity.title   = restaurant_name
    entity.address = address
    entity.sources.zagat = {
        'zurl' : self.base + href, 
    }
    
    # parse cuisine
    header = soup.find('div', {'id' : "block-zagat_restaurants-14"})
    if header is not None:
        header = header.find('ul').find('li', {'class' : 'first'})
        if header is not None:
            entity.cuisine = header.getText()
    
    # parse website
    site = soup.find('span', {'class' : 'website'})
    if site is not None:
        site = site.find('a')
        if site is not None:
            entity.site = site.get('href')
    
    # parse preview image
    img = soup.find('div', {'id' : 'content'}).find('div', {'class' : 'photo'})
    if img is not None:
        img = img.find('img')
        if img is not None:
            entity.image = img.get('src')
    
    self._output.put(entity)
def _parseResultsPage(self, queue, url, name, depth):
    try:
        soup = utils.getSoup(url)
    except:
        utils.printException()
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return
    
    if depth < self.max_depth:
        # extract and parse subcategory pages
        category_ul = soup.find('ul', {'id' : 'zg_browseRoot'})
        
        if category_ul is not None:
            # descend to the innermost nested ul, which lists the 
            # subcategories of the current category
            while True:
                temp_ul = category_ul.find('ul')
                
                if temp_ul is None:
                    break
                else:
                    category_ul = temp_ul
            
            categories = category_ul.findAll('a')
            
            for category in categories:
                href = category.get('href')
                name = utils.normalize(category.getText())
                
                queue.put_nowait((href, name, depth + 1))
    
    self._globals['books'] = soup
    
    rss_link = soup.find('div', {'id' : 'zg_rssLinks'})
    if rss_link is None:
        return
    
    rss_link = rss_link.findAll('a')[1].get('href')
    if rss_link in self.seen:
        return
    
    self.seen.add(rss_link)
    
    entity = Entity()
    entity.title = rss_link
    entity.subcategory = 'book'
    
    self._output.put(entity)
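# Self-contained sketch of the queue-driven traversal used above: seed a
# queue with (url, name, depth) tuples, stop descending once max_depth is
# reached, and dedupe leaves with a seen set. The link graph here is a
# stand-in dict rather than scraped category pages.
from Queue import Queue   # 'queue' on Python 3

LINKS     = { 'root' : ['a', 'b'], 'a' : ['a1'], 'b' : [], 'a1' : [] }
max_depth = 1
seen      = set()

q = Queue()
q.put_nowait(('root', 'Books', 0))

while not q.empty():
    url, name, depth = q.get_nowait()
    
    if depth < max_depth:
        for child in LINKS[url]:
            q.put_nowait((child, name, depth + 1))
    
    if url not in seen:
        seen.add(url)
        print(url)   # 'root', 'a', 'b' -- 'a1' is never enqueued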
def _parseRestaurantPage(self, pool, queue, url, name, base=False):
    utils.log('[%s] parsing restaurant page %s (%s)' % (self, name, url))
    
    try:
        soup = utils.getSoup(url)
    except:
        #utils.printException()
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return
    
    content = soup.find('div', { 'id' : 'content' })
    if content is None:
        return
    
    entity = Entity()
    entity.title = content.find('h1').getText()
    entity.subcategory = "restaurant"
    entity.seattletimes = {}
    
    details = content.find('div', {'id' : 'edbtext'})
    
    desc = details.find('p').getText()
    if desc is not None:
        entity.desc = desc
    
    details = details.findAll('p', {'class' : 'list'})
    
    address = details[0].renderContents().strip().replace('<br />', '')
    address = re.sub('[ \n\t]+', ' ', address)
    entity.address = address
    
    if len(details) > 1:
        site = details[1].get('href')
        if site is not None:
            entity.site = site
    
    if len(details) > 2:
        hoursOfOperation = details[2].getText()
        if hoursOfOperation is not None:
            entity.hoursOfOperation = hoursOfOperation
    
    key = (entity.title, entity.address)
    if key in self.seen or '(closed)' in entity.title.lower():
        return
    
    self.seen.add(key)
    self._output.put(entity)
def _parseDetailPage(self, name, href, subcategory):
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing %s (%s)" % (self, name, href))
        return
    
    summ = soup.find('div', {'class' : 'summary-address'})
    
    try:
        # prefer the structured hCard address if the listing has one
        addrp = summ.find('p', {'class' : 'adr'})
        
        street_addr = addrp.find('span', {'class' : 'street-address'}).getText().strip()
        locality    = addrp.find('span', {'class' : 'locality'}).getText().strip()
        region      = addrp.find('span', {'class' : 'region'}).getText().strip()
        
        try:
            postal_code = addrp.find('span', {'class' : 'postal-code'}).getText().strip()
        except AttributeError:
            postal_code = ""
        
        addr = "%s, %s, %s %s" % (street_addr, locality, region, postal_code)
    except AttributeError:
        try:
            # fall back to the unstructured address, which ends with a 
            # "nr. <cross-street>" or "at <cross-street>" suffix
            p = summ.find('p').getText()
            
            r = re.compile('(.*)nr\. ', re.DOTALL)
            m = r.match(p)
            
            if m is None:
                r = re.compile('(.*)at[. ]', re.DOTALL)
                m = r.match(p)
            
            addr = m.groups()[0].replace('\n', ' ').strip()
        except AttributeError:
            utils.log("[%s] error parsing %s (%s)" % (self, name, href))
            return
    
    entity = Entity()
    entity.subcategory = subcategory
    entity.title   = name
    entity.address = addr
    entity.nymag   = { }
    
    self._output.put(entity)
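# Sketch of the unstructured fallback above: NY Mag-style listings end the
# street address with a "nr. <cross-street>" or "at <cross-street>" suffix,
# so everything before that marker is the address proper. The sample
# strings are invented.
import re

def parse_addr(p):
    m = re.match(r'(.*)nr\. ', p, re.DOTALL) or re.match(r'(.*)at[. ]', p, re.DOTALL)
    return m.groups()[0].replace('\n', ' ').strip() if m else None

print(parse_addr('123 Main St. nr. 5th Ave.'))      # '123 Main St.'
print(parse_addr('456 Broadway at Prince St.'))     # '456 Broadway'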
def _parseResultsPage(self, pool, queue, url, name, base=False):
    utils.log('[%s] parsing page %s (%s)' % (self, name, url))
    
    try:
        # BeautifulSoup mishandles the site's HTML5 <header> tags, so 
        # rewrite them as plain <div>s before parsing
        html = utils.getFile(url)
        html = html.replace("header>", "div>")
        soup = BeautifulSoup(html)
    except:
        #utils.printException()
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return
    
    # extract and parse the rest of the paginated results
    if base:
        page = soup.find('nav').find('span').getText()
        num_pages = int(self.page_re.match(page).groups()[0])
        
        for i in xrange(2, num_pages + 1):
            href = '%s&pg=%d' % (url, i)
            queue.put_nowait((href, name))
    
    results = soup.findAll('section', {'class' : 'CWListing'})
    
    for result in results:
        entity = Entity()
        entity.subcategory = "book"
        entity.awardAnnals = {}
        
        entity.title  = result.find('h4').find('a').getText().strip()
        entity.author = result.find('p', {'class' : 'creators'}).getText()
        
        key = (entity.title, entity.author)
        if key in self.seen:
            continue
        
        self.seen.add(key)
        self._output.put(entity)
def _parseListPage(self, pool, queue, url, name, base=False):
    utils.log('[%s] parsing list page %s (%s)' % (self, name, url))
    
    try:
        soup = utils.getSoup(url)
    except:
        #utils.printException()
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return
    
    results = soup.findAll('td', {'class' : 'summary'})
    
    for result in results:
        entity = Entity()
        entity.subcategory = "book"
        entity.nytimes = {}
        
        title = result.find('span', {'class' : 'bookName'}).getText().strip().title()
        if title.endswith(','):
            title = title[0:-1]
        
        entity.title = title
        
        # flatten the result's text with a sentinel separator so the 
        # author / publisher / description fields can be pulled out with 
        # a single regex
        details = result.getText(separator='___')
        details_match = self.details_re.match(details)
        
        if details_match:
            details_match = details_match.groups()
            
            entity.author    = details_match[0]
            entity.publisher = details_match[1]
            entity.desc      = details_match[2]
        
        key = (entity.title, entity.author)
        if key in self.seen:
            continue
        
        self.seen.add(key)
        self._output.put(entity)
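# The separator trick above, in isolation: flattening a node's text with
# getText(separator=...) yields one predictable string that a single regex
# can split into fields. bs4 and the sample markup are assumptions for
# illustration; the real details_re lives on the crawler class.
import re
from bs4 import BeautifulSoup

html = ('<td class="summary"><span class="bookName">The Example</span>'
        '<span>Jane Doe</span><span>Acme Press</span></td>')

td      = BeautifulSoup(html, 'html.parser').find('td', {'class' : 'summary'})
details = td.getText(separator='___')   # 'The Example___Jane Doe___Acme Press'

title, author, publisher = re.match(r'(.+)___(.+)___(.+)', details).groups()
print('%s / %s' % (author, publisher))  # Jane Doe / Acme Press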
def _parseResultsPage(self, pool, href):
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return
    
    results = soup.find('div', { 'id' : 'searchResults' }).findAll('td', { 'class' : 'start' })
    
    for result in results:
        try:
            name = result.find('a').getText().strip()
        except AttributeError:
            # log the page only; 'name' is unbound if the lookup failed
            utils.log("[%s] error parsing name (%s)" % (self, href))
            return
        
        # the address layout depends on how many <br> tags the result has
        num_brs = len(result.findAll('br'))
        
        if num_brs == 3:
            try:
                addr = '{0}, {1}'.format(result.find('a').nextSibling.strip(), 
                                         result.find('br').nextSibling.strip())
            except Exception:
                utils.log("[%s] error parsing address for %s (%s)" % (self, name, href))
                return
        elif num_brs == 4:
            try:
                addr = '{0}, {1}'.format(result.contents[3].strip(), 
                                         result.contents[5].strip())
            except Exception:
                utils.log("[%s] error parsing address for %s (%s)" % (self, name, href))
                return
        else:
            addr = ''
        
        if addr == '':
            continue
        if 'CLOSED' in name:
            continue
        if (name, addr) in self._seen:
            continue
        
        self._seen.add((name, addr))
        
        entity = Entity()
        entity.subcategory = "restaurant"
        entity.title   = name
        entity.address = addr
        entity.sources.bostonmag = { }
        
        self._output.put(entity)
    
    # try the next page
    next_page_ending = soup.find('div', { 'class' : 'right_align' }).findAll('a')
    next_page = ''
    
    for n in next_page_ending:
        if 'Next' in str(n):
            next_page = href.replace(href[href.find('?'):], n.get('href'))
    
    if next_page != '':
        pool.spawn(self._parseResultsPage, pool, next_page)
def _parse_dump(self, filename):
    f = gzip.open(filename, 'rb')
    
    context = iter(etree.iterparse(f, events=("start", "end")))
    event, root = context.next()
    
    nid_re           = re.compile('.*\/([0-9]*)$')
    language_re      = re.compile('.*\/languages$')
    match_genre_re   = re.compile('.*\/genres$')
    match_ratings_re = re.compile('.*\/mpaa_ratings$')
    
    match_genre_func    = lambda c: re.match(match_genre_re, c.get('scheme')) is not None
    match_ratings_func  = lambda c: re.match(match_ratings_re, c.get('scheme')) is not None
    match_language_func = lambda c: re.match(language_re, c.get('scheme')) is not None
    
    count = 0
    bonus_materials = set()
    
    # loop through each XML catalog_title element and parse it as a movie Entity
    for event, elem in context:
        if event == "end" and elem.tag == "catalog_title":
            root.clear()
            
            try:
                rating_elem = elem.find('average_rating')
                if rating_elem is None:
                    continue
                
                entity = Entity()
                
                nid = elem.find('id').text
                nid = int(re.match(nid_re, nid).groups()[0])
                
                # track bonus-material ids so they can be skipped below
                bonus_materials_elem = elem.find('.//bonus_materials')
                if bonus_materials_elem is not None:
                    links = map(lambda l: l.get('href'), bonus_materials_elem.findall('link'))
                    
                    for link in links:
                        bonus_material_id = int(re.match(nid_re, link).groups()[0])
                        bonus_materials.add(bonus_material_id)
                
                if nid in bonus_materials:
                    continue
                
                title  = elem.find('title').get('regular')
                titlel = title.lower()
                
                if 'bonus material' in titlel:
                    continue
                
                entity.title   = title
                entity.nid     = nid
                entity.desc    = elem.find('.//synopsis').text
                entity.nrating = float(rating_elem.text)
                
                categories = elem.findall('category')
                genres = map(lambda c: c.get('label'), filter(match_genre_func, categories))
                entity.ngenres = genres
                
                tv = False
                for genre in genres:
                    if 'tv' in genre.lower():
                        tv = True
                        break
                
                if tv:
                    entity.subcategory = 'tv'
                else:
                    entity.subcategory = 'movie'
                
                ratings = map(lambda c: c.get('label'), filter(match_ratings_func, categories))
                if 1 == len(ratings):
                    entity.mpaa_rating = ratings[0]
                
                images = elem.find('.//box_art').findall('link')
                if 3 == len(images) or 4 == len(images):
                    entity.tiny  = images[0].get('href')
                    entity.small = images[1].get('href')
                    entity.large = images[2].get('href')
                    
                    if 4 == len(images):
                        entity.hd = images[3].get('href')
                
                links = filter(lambda l: 'web page' == l.get('title'), elem.findall('link'))
                if 1 == len(links):
                    entity.nurl = links[0].get('href')
                
                language_elem  = elem.find('.//languages_and_audio')
                language_elems = filter(match_language_func, language_elem.findall('.//category'))
                
                release_year_elem = elem.find('release_year')
                if release_year_elem is not None:
                    entity.original_release_date = release_year_elem.text
                
                duration = elem.find('runtime')
                if duration is not None:
                    entity.track_length = duration.text
                
                languages = set()
                for elem2 in language_elems:
                    languages.add(elem2.get('label').lower())
                
                # only keep titles available in english
                if 'english' not in languages:
                    continue
                
                self._output.put(entity)
                count += 1
                
                # give the downstream consumer threads an occasional chance to work
                if 0 == (count % 512):
                    time.sleep(0.1)
                
                elem.clear()
            except Exception, e:
                utils.printException()
                utils.log(elem.find('title').get('regular'))
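# The streaming idiom above, reduced to a runnable sketch: iterparse with
# ("start", "end") events, grab the root from the first event, and clear
# both the finished element and the root so memory stays bounded on
# multi-gigabyte dumps. lxml.etree is assumed (the getparent()-based
# cleanup in the next parser requires it); the XML payload is made up.
from io import BytesIO
from lxml import etree

xml = b"<catalog>" + b"".join(
    b"<catalog_title><id>http://api.example.com/title/%d</id></catalog_title>" % i
    for i in range(3)) + b"</catalog>"

context = iter(etree.iterparse(BytesIO(xml), events=("start", "end")))
event, root = next(context)              # first event is the start of the root

for event, elem in context:
    if event == "end" and elem.tag == "catalog_title":
        print(elem.find('id').text)      # ... build an Entity here ...
        elem.clear()                     # free this record's subtree
        root.clear()                     # drop the root's reference to it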
def _parseResultsPage(self, pool, href):
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return
    
    results = soup.find('ul', { 'id' : 'search_pagination' }).findAll('div', { 'class' : 'listing_item' })
    
    for result in results:
        try:
            name = result.find('h2').getText().strip()
        except AttributeError:
            # log the page only; 'name' is unbound if the lookup failed
            utils.log("[%s] error parsing name (%s)" % (self, href))
            return
        
        try:
            addr = result.find('span', { 'class' : 'address' }).getText().strip()
        except AttributeError:
            utils.log("[%s] error parsing address for %s (%s)" % (self, name, href))
            return
        
        if addr == '':
            continue
        if 'CLOSED' in name:
            continue
        if addr in self._seen:
            continue
        
        self._seen.add(addr)
        
        # allow at most three locations per restaurant name
        if name in self._count:
            if self._count[name] < 3:
                self._count[name] = self._count[name] + 1
            else:
                continue
        else:
            self._count[name] = 1
        
        entity = Entity()
        entity.subcategory = "restaurant"
        entity.title   = name
        entity.address = addr
        entity.sources.latimes = { }
        
        self._output.put(entity)
    
    # try the next page
    try:
        next_page = soup.find('a', { 'class' : 'next_page' }).get("href")
        
        if next_page != '':
            next_page_url = "{0}{1}".format('http://findlocal.latimes.com', next_page)
            pool.spawn(self._parseResultsPage, pool, next_page_url)
    except AttributeError:
        # crawling of pages is done
        pass
def _parse_series_page(self, name, url):
    if '**' in name or 'DUPLICATE' in name or name.startswith('.hack'):
        return
    
    utils.log('[%s] parsing page %s (%s)' % (self, name, url))
    
    try:
        soup = utils.getSoup(url)
    except:
        utils.printException()
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return
    
    contents = soup.findAll('div', {'id' : 'content'})
    header   = contents[0]
    
    h1    = header.find('h1')
    title = h1.getText()
    h1.extract()
    
    entity = Entity()
    
    # parse basic show info
    entity.title = title
    entity.subcategory = 'tv'
    
    desc = header.getText().replace('\r\n', '\n')
    if len(desc) > 5:
        entity.desc = desc
    
    entity.sources.thetvdb_id = self._id_re.match(url).groups()[0]
    
    # parse images
    images = map(lambda img: img.get('src'), soup.findAll('img', {'class' : 'banner'}))
    types  = [ 'posters', 'fanart', 'graphical', ]
    
    for image_type in types:
        filtered_images = filter(lambda img: image_type in img, images)
        
        if len(filtered_images) > 0:
            entity.image = "%s%s" % (self.base, filtered_images[0])
            break
    
    info = contents[1].find('table').find('table')
    rows = info.findAll('tr')
    
    # parse detailed show info
    info_map = {
        0 : 'original_release_date', 
        3 : 'air_time', 
        4 : 'network_name', 
        5 : 'genre', 
    }
    
    for k, k2 in info_map.iteritems():
        try:
            value = rows[k].findAll('td')[1].getText()
            
            if len(value) > 0:
                entity[k2] = value
        except:
            utils.printException()
    
    # parse cast
    try:
        actors = "%s%s" % (self.base, contents[-1].findAll('a')[-1].get('href'))
        actors_soup = utils.getSoup(actors)
        
        infotables = actors_soup.findAll('table', {'class' : 'infotable'})
        cast = []
        
        for infotable in infotables:
            text  = infotable.find('td').getText(separator='___')
            match = self._actor_re.match(text)
            
            if match is not None:
                groups = match.groups()
                cast.append('%s as %s' % (groups[0].strip(), groups[1].strip()))
                # TODO: record actor images
        
        if len(cast) > 0:
            entity.cast = ', '.join(cast)
    except:
        pass
    
    # parse seasons
    try:
        seasons = "%s%s" % (self.base, contents[2].findAll('a')[-1].get('href'))
        seasons_soup = utils.getSoup(seasons)
        
        rows = seasons_soup.find('table', {'id' : 'listtable'}).findAll('tr')[1:]
        
        highest_season = -1
        earliest = None
        latest   = None
        
        # each row is an episode; loop through each episode, recording the 
        # earliest and latest air date for the show overall and the number 
        # of seasons the show ran for.
        for row in rows:
            tds = row.findAll('td')
            episode = tds[0].getText()
            match = self._season_re.match(episode)
            
            if match is not None:
                groups  = match.groups()
                season  = int(groups[0])
                episode = int(groups[1])
                
                if season > highest_season:
                    highest_season = season
            
            date  = tds[2].getText()
            match = self._date_re.match(date)
            
            if match is not None:
                year, month, day = match.groups()
                date = datetime(year=int(year), month=int(month), day=int(day))
                
                if earliest is None or date < earliest:
                    earliest = date
                
                if latest is None or date > latest:
                    latest = date
        
        if highest_season > 0:
            entity.num_seasons = highest_season
        
        if earliest is not None:
            entity.earliest_air_date = earliest
        
        if latest is not None:
            entity.latest_air_date = latest
    except:
        utils.printException()
    
    entity2 = self._thetvdb.lookup(entity.sources.thetvdb_id)
    
    if entity2 is not None:
        if entity2.mpaa_rating is not None:
            entity.mpaa_rating = entity2.mpaa_rating
        
        if entity2.imdb_id is not None:
            entity.imdb_id = entity2.imdb_id
    
    self._output.put(entity)
def _parse_dump(self, filepath):
    f = gzip.open(filepath, 'rb')
    
    context = iter(etree.iterparse(f, events=("start", "end")))
    event, root = context.next()
    
    offset = 0
    count  = 0
    
    # loop through XML and parse each product element as a book Entity
    for event, elem in context:
        if event == "end" and elem.tag == "product" and elem.get('product_id') is not None:
            root.clear()
            
            if offset < Globals.options.offset:
                offset += 1
                continue
            
            if Globals.options.limit and count >= Globals.options.limit:
                break
            
            try:
                entity = Entity()
                entity.subcategory = "book"
                
                entity.title        = elem.get('name')
                entity.bid          = int(elem.get('product_id'))
                entity.sku_number   = elem.get('sku_number')
                entity.image        = elem.find('.//productImage').text
                entity.author       = elem.find('.//Author').text
                entity.publisher    = elem.find('.//Publisher').text
                entity.publish_date = elem.find('.//Publish_Date').text
                
                isbn = elem.find('.//ISBN').text
                if isbn is None or len(isbn) <= 0:
                    continue
                
                entity.isbn = isbn
                
                # crude language filter: only keep books whose description 
                # mentions "English" / "english"
                desc = elem.find('description')
                is_english = 'nglish' in etree.tostring(desc)
                
                if not is_english:
                    continue
                
                self._output.put(entity)
                count += 1
                
                # give the downstream consumer threads an occasional chance to work
                if 0 == (count % 512):
                    time.sleep(0.1)
                
                # free already-processed preceding siblings to keep memory bounded
                parent = elem.getparent()
                while True:
                    prev = elem.getprevious()
                    if prev is None:
                        break
                    parent.remove(prev)
                
                elem.clear()
            except Exception, e:
                utils.printException()
def _parseResultsPage(self, pool, href):
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return
    
    results = soup.find('div', { 'class' : 'searchresults' }).findAll('div', { 'class' : 'fs1-sans' })
    
    for result in results:
        # skip the metadata rows interleaved with the actual listings
        text = result.getText()
        if 'Price' in text or 'Kid' in text or 'Other' in text or \
           'Wheelchair' in text or 'Cuisines' in text or 'Rating' in text or \
           'Latest' in text:
            continue
        
        try:
            name = result.find('strong').getText().strip()
        except Exception:
            continue
        
        try:
            addr = '{0} {1}, {2}, {3}'.format(result.find('span').getText(), 
                                              result.find('span').findNext('span').getText(), 
                                              result.find('span').findNext('span').findNext('span').getText(), 
                                              result.find('span').findNext('span').findNext('span').findNext('span').getText())
        except Exception:
            addr = ''
            utils.log("[%s] error parsing %s (%s)" % (self, addr, href))
            continue
        
        if addr == '':
            continue
        if name == '':
            continue
        if (name, addr) in self._seen:
            continue
        
        self._seen.add((name, addr))
        
        entity = Entity()
        entity.subcategory = "restaurant"
        entity.title   = name
        entity.address = addr
        entity.sources.washmag = { }
        
        self._output.put(entity)
    
    return
def _parseResultsPage(self, pool, href):
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return
    
    results = soup.find('td', { 'id' : 'search-results' }).findAll('tr')
    
    for result in results:
        try:
            name = result.find('td', { 'class' : 'business-name' }).find('a').getText().strip()
        except Exception:
            continue
        
        try:
            # the street and the city/state sit on either side of the 
            # contact cell's first <br>
            contact = result.find('td', { 'class' : 'contact' })
            addr = '{0}, {1}'.format(contact.find('br').previousSibling.strip(), 
                                     contact.find('br').nextSibling.strip())
        except Exception:
            addr = ''
            utils.log("[%s] error parsing %s (%s)" % (self, addr, href))
            continue
        
        if 'OPENING SOON' in result.find('td', { 'class' : 'categories' }).getText():
            continue
        if addr == '':
            continue
        if name == '':
            continue
        if 'CLOSED' in name:
            continue
        if (name, addr) in self._seen:
            continue
        
        self._seen.add((name, addr))
        
        entity = Entity()
        entity.subcategory = "restaurant"
        entity.title   = name
        entity.address = addr
        entity.sources.chicagomag = { }
        
        self._output.put(entity)
    
    # try the next page
    next_page_all = soup.find('div', { 'id' : 'pager' }).findAll('a')
    next_page = ''
    
    for n in next_page_all:
        if 'Next' in n.getText():
            next_page = n.get('href')
    
    if next_page != '':
        pool.spawn(self._parseResultsPage, pool, next_page)
def _parseResultsPage(self, pool, url, offset=0, base=False):
    utils.log('[%s] parsing page %s' % (self, url))
    
    max_offset = 8
    
    if offset < max_offset:
        # optimistically process the next results page before processing 
        # this one
        if 'start=' in url:
            start = self.start_re.match(url).groups()[0]
            nexti = int(start) + self.results_per_page
            url2  = url.replace('start=%s' % start, 'start=%d' % nexti)
        else:
            url2 = "%s&start=%d" % (url, self.results_per_page)
        
        pool.spawn(self._parseResultsPage, pool, url2, offset + 1)
    
    try:
        soup = utils.getSoup(url)
    except:
        utils.printException()
        utils.log("[%s] error downloading page %s" % (self, url))
        return
    
    if offset >= max_offset:
        # past the optimistic window, fall back to following the real 
        # "next page" link
        next_pagel = soup.find('a', {'id' : 'pager_page_next'})
        
        if next_pagel is not None:
            href = self.base + next_pagel.get('href')
            pool.spawn(self._parseResultsPage, pool, href, 0)
            time.sleep(0.01)
    
    if base:
        categories = soup.findAll('a', {'id' : self.category_re})
        
        if categories is not None:
            for category in categories:
                href = self.base + category.get('href')
                pool.spawn(self._parseResultsPage, pool, href, 0)
                
                # yield so other threads have a chance to start working
                time.sleep(0.01)
    
    separator = '___'
    results = soup.findAll('div', {'class' : re.compile('businessresult')})
    
    if results is None:
        return
    
    for result in results:
        entity = Entity()
        entity.subcategory = 'restaurant'
        entity.sources.yelp = { }
        
        titlel = result.find('a')
        title  = titlel.getText()
        
        entity.title = self.title_re.match(title).groups()[0]
        entity.yurl  = self.base + titlel.get('href')
        
        addr  = result.find('address').getText(separator)
        match = self.address_re.match(addr).groups()
        
        entity.address = "%s, %s" % (match[0], match[1])
        entity.phone   = match[2]
        
        rating = result.find('img')
        if rating is not None:
            entity.yrating = float(self.rating_reviews_re.match(rating.get('title')).groups()[0])
        
        reviews = result.find('span', {'class' : 'reviews'})
        if reviews is not None:
            entity.yreviews = int(self.rating_reviews_re.match(reviews.getText()).groups()[0])
        
        key = (entity.title, entity.address)
        
        if key not in self.seen:
            self.seen.add(key)
            self._output.put(entity)
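# A sketch (under invented names) of the optimistic pagination used above:
# spawn a fetch of the guessed next page before parsing the current one, so
# page N+1 downloads while page N is parsed. Pool is gevent.pool.Pool,
# matching the pool.spawn(...) calls throughout these crawlers; the "site"
# is faked as a dict of five result pages.
from gevent.pool import Pool

PAGES = dict((i, 'results page %d' % i) for i in range(5))

def crawl(pool, page, offset=0, max_offset=8):
    if offset < max_offset:
        # optimistically fetch the guessed next page in parallel
        pool.spawn(crawl, pool, page + 1, offset + 1)
    
    body = PAGES.get(page)
    if body is None:
        return               # guessed past the last page
    
    print(body)              # ... parse results here ...

pool = Pool(4)
pool.spawn(crawl, pool, 0)
pool.join()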