        Globals.options.offset = 0
        if Globals.options.limit:
            Globals.options.limit = max(0, Globals.options.limit - count)
        
        pool.join()
        self._output.put(StopIteration)
        csvFile.close()
        
        utils.log("[%s] finished parsing %d entities" % (self.NAME, count))
    
    def _parseEntity(self, row, count):
        #utils.log("[%s] parsing entity %d" % (self.NAME, count))
        
        entity = Entity()
        entity.subcategory = "app"
        entity.factual = {
            'table' : 'iPhone_Apps.csv'
        }
        
        for srcKey, destKey in self._map.iteritems():
            if srcKey in row and row[srcKey] and len(row[srcKey]) > 0:
                entity[destKey] = row[srcKey]
        
        self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('factualiPhoneApps', FactualiPhoneAppsDump)

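# Illustrative sketch only -- an assumption about what the registry pattern used by every module
# in this package looks like, NOT the project's actual crawler.EntitySources implementation.
# Each source module calls EntitySources.registerSource(name, cls) at import time so the crawler
# can later look sources up and construct them by their string key.
class ExampleSourceRegistry(object):
    _sources = { }
    
    @classmethod
    def registerSource(cls, name, source_cls):
        # map a short string key to the class implementing that source
        cls._sources[name] = source_cls
    
    @classmethod
    def createSource(cls, name, *args, **kwargs):
        # hypothetical lookup-and-construct helper, shown for illustration
        return cls._sources[name](*args, **kwargs)
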
            entity.subcategory = 'restaurant'
            entity.sources.yelp = { }
            
            titlel = result.find('a')
            title = titlel.getText()
            
            entity.title = self.title_re.match(title).groups()[0]
            entity.yurl = self.base + titlel.get('href')
            
            addr = result.find('address').getText(separator)
            match = self.address_re.match(addr).groups()
            
            entity.address = "%s, %s" % (match[0], match[1])
            entity.phone = match[2]
            
            rating = result.find('img')
            if rating is not None:
                entity.yrating = float(self.rating_reviews_re.match(rating.get('title')).groups()[0])
            
            reviews = result.find('span', {'class' : 'reviews'})
            if reviews is not None:
                entity.yreviews = int(self.rating_reviews_re.match(reviews.getText()).groups()[0])
            
            key = (entity.title, entity.address)
            if key not in self.seen:
                self.seen.add(key)
                self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('yelp', YelpCrawler)

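# Note on the YelpCrawler parsing above (assumption -- the real patterns are defined elsewhere in
# the class): title_re presumably strips the numeric ranking prefix from listing titles such as
# "1. Some Restaurant", and rating_reviews_re presumably extracts the leading number from strings
# like "4.5 star rating" or "253 reviews". Hypothetical equivalents, for illustration only:
#
#   title_re          = re.compile(r'\d+\s*\.\s*(.*)')
#   rating_reviews_re = re.compile(r'\s*([\d.]+)')
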
                    self._count[name] = self._count[name] + 1
                else:
                    continue
            else:
                self._count[name] = 1
            
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title = name
            entity.address = addr
            entity.sources.latimes = { }
            
            self._output.put(entity)
        
        # try the next page
        try:
            next_page = soup.find('a', {'class': 'next_page'}).get("href")
            
            if next_page != '':
                next_page_url = "{0}{1}".format('http://findlocal.latimes.com', next_page)
                pool.spawn(self._parseResultsPage, pool, next_page_url)
        except AttributeError:
            # crawling of pages is done
            #utils.log("Done crawling: %s" % href)
            pass

from crawler import EntitySources
EntitySources.registerSource('latimes', LATimesCrawler)

entity.subcategory = "restaurant" entity.title = row[1] entity.address = row[3] + ', ' + \ row[4] + ', ' + \ row[5] + ' ' + \ row[6] entity.openTable = { 'rid' : int(row[8]), 'reserveURL' : row[9], 'countryID' : row[10], 'metroName' : row[0], 'neighborhoodName' : row[2], } # don't make external calls to opentable in test mode if not Globals.options.test: result = OpenTableParser.parseEntity(entity) if result is None: return if entity is not None: #print entity.title #from pprint import pprint #pprint(entity.getDataAsDict()) self._output.put(entity) from crawler import EntitySources EntitySources.registerSource('opentable', OpenTableDump)
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            
            if 'Bars' in result.findNext('span').getText():
                entity.subcategory = "bar"
            else:
                entity.subcategory = "restaurant"
            
            entity.title = name
            entity.address = addr
            entity.sources.timeout_sf = { }
            
            self._output.put(entity)
        
        # try the next page
        try:
            href_get = soup.find('div', { 'class' : 'next' }).find('a').get('href')
            next_page = '{0}{1}'.format('http://www.timeout.com', href_get)
        except Exception:
            next_page = ''
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)

from crawler import EntitySources
EntitySources.registerSource('timeout_sf', TimeOutSFCrawler)

            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            
            if 'Bars' in result.findNext('span').getText():
                entity.subcategory = "bar"
            else:
                entity.subcategory = "restaurant"
            
            entity.title = name
            entity.address = addr
            entity.sources.timeout_la = { }
            
            self._output.put(entity)
        
        # try the next page
        try:
            href_get = soup.find('div', { 'class' : 'next' }).find('a').get('href')
            next_page = '{0}{1}'.format('http://www.timeout.com', href_get)
        except Exception:
            next_page = ''
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)

from crawler import EntitySources
EntitySources.registerSource('timeout_la', TimeOutLACrawler)

                                   result.find('span').findNext('span').findNext('span').findNext('span').getText())
            except Exception:
                addr = ''
                utils.log("[%s] error parsing %s (%s)" % (self, addr, href))
                continue
            
            if addr == '':
                continue
            if name == '':
                continue
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title = name
            entity.address = addr
            entity.sources.washmag = { }
            
            self._output.put(entity)
        
        return

from crawler import EntitySources
EntitySources.registerSource('washmag', WashMagCrawler)

        details = content.find('div', {'id' : 'edbtext'})
        desc = details.find('p').getText()
        
        if desc is not None:
            entity.desc = desc
        
        details = details.findAll('p', {'class' : 'list'})
        
        address = details[0].renderContents().strip().replace('<br />', '')
        address = re.sub('[ \n\t]+', ' ', address)
        entity.address = address
        
        if len(details) > 1:
            site = details[1].get('href')
            if site is not None:
                entity.site = site
        
        if len(details) > 2:
            hoursOfOperation = details[2].getText()
            if hoursOfOperation is not None:
                entity.hoursOfOperation = hoursOfOperation
        
        key = (entity.title, entity.address)
        if key in self.seen or '(closed)' in entity.title.lower():
            return
        
        self.seen.add(key)
        self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('seattletimes', SeattleTimesCrawler)

        #self._globals['soup'] = soup
        
        # parse cuisine
        header = soup.find('div', {'id' : "block-zagat_restaurants-14"})
        if header is not None:
            header = header.find('ul').find('li', {'class' : 'first'})
            if header is not None:
                entity.cuisine = header.getText()
        
        # parse website
        site = soup.find('span', {'class' : 'website'})
        if site is not None:
            site = site.find('a')
            if site is not None:
                entity.site = site.get('href')
        
        # parse preview image
        img = soup.find('div', {'id' : 'content'}).find('div', {'class' : 'photo'})
        if img is not None:
            img = img.find('img')
            if img is not None:
                entity.image = img.get('src')
        
        self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('zagat', ZagatCrawler)

                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "bar"
            entity.title = name
            entity.address = addr
            entity.sources.sfweekly = { }
            
            self._output.put(entity)
        
        # try the next page
        try:
            pagination = soup.find('span', { 'class' : 'Pagination' }).getText()
            
            if 'Next' in pagination:
                pagination = soup.find('span', { 'class' : 'Pagination' })
                href_get = pagination.find('span', { 'class' : 'PaginationSelected' }).findNext('a').get('href')
                next_page = '{0}{1}'.format('http://www.sfweekly.com', href_get)
            else:
                next_page = ''
        except Exception:
            next_page = ''
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)

from crawler import EntitySources
EntitySources.registerSource('sfweekly', SFWeeklyCrawler)

                if earliest is None or date < earliest:
                    earliest = date
                if latest is None or date > latest:
                    latest = date
            
            if highest_season > 0:
                entity.num_seasons = highest_season
            if earliest is not None:
                entity.earliest_air_date = earliest
            if latest is not None:
                entity.latest_air_date = latest
        except:
            utils.printException()
        
        entity2 = self._thetvdb.lookup(entity.sources.thetvdb_id)
        
        if entity2 is not None:
            if entity2.mpaa_rating is not None:
                entity.mpaa_rating = entity2.mpaa_rating
            if entity2.imdb_id is not None:
                entity.imdb_id = entity2.imdb_id
        
        self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('thetvdb', TheTVDBCrawler)

                    continue
                
                #utils.log(entity.title)
                #pprint(entity.getDataAsDict())
                
                """
                self._globals['n'] = elem
                self._globals['s'] = etree.tostring(elem, pretty_print=True)
                self._globals['e'] = entity
                break
                """
                
                self._output.put(entity)
                count += 1
                
                # give the downstream consumer threads an occasional chance to work
                if 0 == (count % 512):
                    time.sleep(0.1)
                
                elem.clear()
            except Exception, e:
                utils.printException()
                utils.log(elem.find('title').get('regular'))
        
        f.close()
        return count

from crawler import EntitySources
EntitySources.registerSource('netflix', NetflixDump)

            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title = name
            entity.address = addr
            entity.sources.chicagomag = { }
            
            self._output.put(entity)
        
        # try the next page
        next_page_all = soup.find('div', { 'id' : 'pager' }).findAll('a')
        next_page = ''
        
        for n in next_page_all:
            if 'Next' in n.getText():
                next_page = n.get('href')
            else:
                pass
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)

from crawler import EntitySources
EntitySources.registerSource('chicagomag', ChicagoMagCrawler)

            except AttributeError:
                postal_code = ""
            
            addr = "%s, %s, %s %s" % (street_addr, locality, region, postal_code)
        except AttributeError:
            try:
                p = summ.find('p').getText()
                r = re.compile('(.*)nr\. ', re.DOTALL)
                m = r.match(p)
                
                if m is None:
                    r = re.compile('(.*)at[. ]', re.DOTALL)
                    m = r.match(p)
                
                addr = m.groups()[0].replace('\n', ' ').strip()
            except AttributeError:
                utils.log("[%s] error parsing %s (%s)" % (self, name, href))
                return
        
        entity = Entity()
        entity.subcategory = subcategory
        entity.title = name
        entity.address = addr
        entity.nymag = { }
        
        self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('nymag', NYMagCrawler)

            categories = category_ul.findAll('a')
            
            for category in categories:
                href = category.get('href')
                name = utils.normalize(category.getText())
                
                queue.put_nowait((href, name, depth + 1))
        
        self._globals['books'] = soup
        
        rss_link = soup.find('div', {'id' : 'zg_rssLinks'})
        if rss_link is None:
            return
        
        rss_link = rss_link.findAll('a')[1].get('href')
        
        if rss_link in self.seen:
            return
        
        self.seen.add(rss_link)
        
        entity = Entity()
        entity.title = rss_link
        entity.subcategory = 'book'
        
        self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('amazonbestsellerbookfeeds', AmazonBestSellerBookFeeds)

        for result in results:
            entity = Entity()
            entity.subcategory = "book"
            entity.nytimes = {}
            
            title = result.find('span', {'class' : 'bookName'}).getText().strip().title()
            if title.endswith(','):
                title = title[0:-1]
            
            entity.title = title
            
            details = result.getText(separator='___')
            details_match = self.details_re.match(details)
            
            if details_match:
                details_match = details_match.groups()
                
                entity.author = details_match[0]
                entity.publisher = details_match[1]
                entity.desc = details_match[2]
            
            key = (entity.title, entity.author)
            if key in self.seen:
                continue
            
            self.seen.add(key)
            self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('nytimesbooks', NYTimesBestSellerCrawler)

            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title = name
            entity.address = addr
            entity.sources.sfmag = { }
            
            self._output.put(entity)
        
        # locate total pages and compare against current page num to determine if we should iterate again
        try:
            total_pages = soup.find('span', { 'class' : 'last' }).findPrevious('span').getText().strip()
        except AttributeError:
            # crawling of pages is done
            return
        
        index = href.find('&page=')
        end = href.find('&keyword')
        page = href[index+6:end]
        
        if int(page) <= int(total_pages) - 1:
            next_page = href.replace('&page=' + str(page), '&page=' + str(int(page) + 1))
            pool.spawn(self._parseResultsPage, pool, next_page)
        else:
            return
        
        time.sleep(0.01)

from crawler import EntitySources
EntitySources.registerSource('sfmag', SFMagCrawler)

                self._output.put(entity)
                count += 1
                
                # give the downstream consumer threads an occasional chance to work
                if 0 == (count % 512):
                    time.sleep(0.1)
                
                parent = elem.getparent()
                while True:
                    prev = elem.getprevious()
                    if prev is None:
                        break
                    parent.remove(prev)
                
                elem.clear()
            except Exception, e:
                utils.printException()
        
        #self._globals['books'] = elem
        
        Globals.options.offset -= offset
        if Globals.options.limit:
            Globals.options.limit = max(0, Globals.options.limit - count)
        
        f.close()
        return count

from crawler import EntitySources
EntitySources.registerSource('barnesandnoble', BarnesAndNobleDump)

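# Context for the getparent() / getprevious() / clear() dance in the dump parser above: it is the
# standard lxml iterparse memory-trimming pattern, which frees each processed element and its
# already-handled siblings so a very large XML dump can be streamed in near-constant memory.
# Minimal self-contained sketch of the same pattern; the 'item'/'title' tag names and the
# books.xml path are assumptions for illustration, not part of this repo.
from lxml import etree

def iter_titles(path, tag='item'):
    for event, elem in etree.iterparse(path, events=('end', ), tag=tag):
        yield elem.findtext('title')
        
        elem.clear()                           # release this element's children
        while elem.getprevious() is not None:  # drop siblings we've already yielded
            del elem.getparent()[0]

# usage (assuming such a file exists):
#   for title in iter_titles('books.xml'):
#       print title
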
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            
            if 'Bars' in result.findNext('span').getText():
                entity.subcategory = "bar"
            else:
                entity.subcategory = "restaurant"
            
            entity.title = name
            entity.address = addr
            entity.sources.timeout_mia = { }
            
            self._output.put(entity)
        
        # try the next page
        try:
            href_get = soup.find('div', { 'class' : 'next' }).find('a').get('href')
            next_page = '{0}{1}'.format('http://www.timeout.com', href_get)
        except Exception:
            next_page = ''
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)

from crawler import EntitySources
EntitySources.registerSource('timeout_mia', TimeOutMIACrawler)

            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title = name
            entity.address = addr
            entity.sources.phillymag = { }
            
            self._output.put(entity)
        
        # try the next page
        next_page_ending = soup.find('div', { 'class' : 'right_align' }).findAll('a')
        next_page = ''
        
        for n in next_page_ending:
            if 'Next' in str(n):
                next_page = href.replace(href[href.find('?'):], n.get('href'))
            else:
                pass
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)

from crawler import EntitySources
EntitySources.registerSource('phillymag', PhillyMagCrawler)

        # extract and parse the rest of the paginated results
        if base:
            page = soup.find('nav').find('span').getText()
            num_pages = int(self.page_re.match(page).groups()[0])
            
            for i in xrange(2, num_pages + 1):
                href = '%s&pg=%d' % (url, i)
                queue.put_nowait((href, name))
        
        results = soup.findAll('section', {'class' : 'CWListing'})
        
        for result in results:
            entity = Entity()
            entity.subcategory = "book"
            entity.awardAnnals = {}
            
            entity.title = result.find('h4').find('a').getText().strip()
            entity.author = result.find('p', {'class' : 'creators'}).getText()
            
            key = (entity.title, entity.author)
            if key in self.seen:
                continue
            
            self.seen.add(key)
            self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('awardannals', AwardAnnalsCrawler)

                else:
                    author = author.getText().strip()
                    
                    try:
                        entity.author = self.author_re0.match(author).groups()[0]
                    except AttributeError:
                        try:
                            entity.author = self.author_re1.match(author).groups()[0]
                        except AttributeError:
                            entity.author = author
                            pass
                
                # pprint(entity)
                # self._globals['books'] = entry
                
                if asin in self.seen:
                    continue
                
                self.seen.add(asin)
                self._output.put(entity)
            except:
                utils.printException()
        
        # print soup.prettify()
        # utils.log("[%s] done parsing feed '%s' (%s)" % (self, data.feed.title, url))

from crawler import EntitySources
EntitySources.registerSource("amazonbookfeed", AmazonBookFeed)

        self.video_prices.join()
    
    def _filter(self, row):
        video_id = row.video_id
        
        # only retain videos which are available for purchase in the US storefront
        price_info = self.video_prices.get_row('video_id', video_id)
        
        if price_info is None:
            return False
        
        return {
            'v_retail_price' : price_info['retail_price'],
            'v_currency_code' : price_info['currency_code'],
            'v_availability_date' : price_info['availability_date'],
            'v_sd_price' : price_info['sd_price'],
            'v_hq_price' : price_info['hq_price'],
            'v_lc_rental_price' : price_info['lc_rental_price'],
            'v_sd_rental_price' : price_info['sd_rental_price'],
            'v_hd_rental_price' : price_info['hd_rental_price'],
        }

from crawler import EntitySources

#EntitySources.registerSource('apple', AppleEPFDumps)
EntitySources.registerSource('apple_artists', AppleEPFArtistDump)
EntitySources.registerSource('apple_songs', AppleEPFSongDump)
EntitySources.registerSource('apple_albums', AppleEPFAlbumDump)
EntitySources.registerSource('apple_videos', AppleEPFVideoDump)

        if not collapsed:
            address = FactualUtils.parseAddress(row)
            
            if address is not None:
                entity.address = address
        
        for srcKey, destKey in self._map.iteritems():
            if srcKey in row and row[srcKey]:
                entity[destKey] = row[srcKey]
        
        self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource("factualUSRestaurants", FactualUSRestaurantsDump)

to_collapse = {
    "fuddruckers": False,
    "d'angelo grilled sandwiches": False,
    "pizza factory": False,
    "mexico lindo": False,
    "penn station east coast subs": False,
    "dennys": False,
    "au bon pain": False,
    "whataburger restaurants": False,
    "larry's giant subs": False,
    "firehouse sub": False,
    "huddle house": False,
    "lenny's sub shop": False,
    "crown fried chicken": False,
        try:
            soup = utils.getSoup(href)
        except:
            utils.log("[%s] error downloading page %s" % (self, href))
            return
        
        # parse the address for the current restaurant
        addr = soup.find('span', {'class' : 'adr'})
        street = addr.find('span', {'class' : 'street-address'}).getText().strip()
        locality = addr.find('span', {'class' : 'locality'}).getText().strip()
        region = addr.find('span', {'class' : 'region'}).getText().strip()
        zipcode = addr.find('a', {'class' : re.compile('postal-code')}).getText().strip()
        
        address = "%s, %s, %s %s" % (street, locality, region, zipcode)
        
        # add the current restaurant to the output for this crawler
        entity = Entity()
        entity.subcategory = "restaurant"
        entity.title = restaurant_name
        entity.address = address
        entity.sources.urbanspoon = {
            'uurl' : href,
        }
        
        self._output.put(entity)

from crawler import EntitySources
EntitySources.registerSource('urbanspoon', UrbanspoonCrawler)

            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title = name
            entity.address = addr
            entity.sources.bostonmag = { }
            
            self._output.put(entity)
        
        # try the next page
        next_page_ending = soup.find('div', { 'class' : 'right_align' }).findAll('a')
        next_page = ''
        
        for n in next_page_ending:
            if 'Next' in str(n):
                next_page = href.replace(href[href.find('?'):], n.get('href'))
            else:
                pass
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)

from crawler import EntitySources
EntitySources.registerSource('bostonmag', BostonMagCrawler)

            if name == '':
                continue
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title = name
            entity.address = addr
            entity.sources.sfgate = { }
            
            self._output.put(entity)
        
        # try the next page
        try:
            href_get = soup.find('li', { 'class' : 'next' }).find('a').get('href')
            next_page = '{0}{1}'.format('http://www.sfgate.com', href_get)
        except Exception:
            next_page = ''
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)

from crawler import EntitySources
EntitySources.registerSource('sfgate', SFGateCrawler)