Example #1
 def _parseRestaurantPage(self, pool, region_name, area_name, restaurant_name, href):
     utils.log("[%s] parsing restaurant '%s.%s.%s' (%s)" % (self, region_name, area_name, restaurant_name, href))
     
     try:
         soup = utils.getSoup(href)
     except Exception:
         utils.log("[%s] error downloading page %s" % (self, href))
         return
     
     # parse the address for the current restaurant
     addr     = soup.find('span', {'class' : 'adr'})
     street   = addr.find('span', {'class' : 'street-address'}).getText().strip()
     locality = addr.find('span', {'class' : 'locality'}).getText().strip()
     region   = addr.find('span', {'class' : 'region'}).getText().strip()
     zipcode  = addr.find('a', {'class' : re.compile('postal-code')}).getText().strip()
     
     address = "%s, %s, %s %s" % (street, locality, region, zipcode)
     
     # add the current restaurant to the output for this crawler
     entity = Entity()
     entity.subcategory = "restaurant"
     entity.title   = restaurant_name
     entity.address = address
     entity.sources.urbanspoon = {
         'uurl' : href, 
     }
     
     self._output.put(entity)
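
Note: the address parsing in Example #1 assumes the page marks addresses up as an hCard-style "adr" block. A minimal, self-contained sketch of that extraction (the sample HTML below is illustrative, not taken from the live site):

    from bs4 import BeautifulSoup

    html = """
    <span class="adr">
      <span class="street-address">123 Main St</span>
      <span class="locality">Seattle</span>
      <span class="region">WA</span>
      <a class="postal-code" href="#">98101</a>
    </span>
    """

    soup     = BeautifulSoup(html, 'html.parser')
    addr     = soup.find('span', {'class' : 'adr'})
    street   = addr.find('span', {'class' : 'street-address'}).getText().strip()
    locality = addr.find('span', {'class' : 'locality'}).getText().strip()
    region   = addr.find('span', {'class' : 'region'}).getText().strip()
    zipcode  = addr.find('a', {'class' : 'postal-code'}).getText().strip()

    print "%s, %s, %s %s" % (street, locality, region, zipcode)  # 123 Main St, Seattle, WA 98101
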
Example #2
 def _parseEntity(self, sheet, index, numEntities):
     if numEntities > 100 and ((index - 1) % (numEntities / 100)) == 0:
         utils.log("[%s] done parsing %s" % \
             (self.NAME, utils.getStatusStr(index - 1 - Globals.options.offset, numEntities)))
         time.sleep(0.1)
     
     row = sheet.row_values(index)
     
     entity = Entity()
     entity.subcategory = "restaurant"
     entity.title = row[1]
     entity.address = row[3] + ', ' + \
                      row[4] + ', ' + \
                      row[5] + ' ' + \
                      row[6]
     
     entity.openTable = {
         'rid' : int(row[8]), 
         'reserveURL' : row[9], 
         'countryID' : row[10], 
         'metroName' : row[0], 
         'neighborhoodName' : row[2], 
     }
     
     # don't make external calls to opentable in test mode
     if not Globals.options.test:
         result = OpenTableParser.parseEntity(entity)
         if result is None:
             return
     
     # entity is constructed above, so it can never be None here
     self._output.put(entity)
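
Note: the throttled progress message in Example #2 leans on Python 2 integer division: numEntities / 100 truncates, so the log fires roughly once per percent of rows. A small, standalone restatement of that guard (the function name is ours; // makes the floor division explicit):

    def should_log(index, numEntities):
        if numEntities <= 100:
            return False
        step = numEntities // 100      # rows per one percent of progress
        return (index - 1) % step == 0

    assert should_log(1, 1000)         # fires on rows 1, 11, 21, ...
    assert should_log(11, 1000)
    assert not should_log(12, 1000)
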
Example #3
 def _parseEntity(self, result):
     entity = Entity()
     entity.subcategory = 'other'
     
     if 'titleNoFormatting' in result:
         entity.title = result['titleNoFormatting']
     
     if 'addressLines' in result:
         entity.address = ', '.join(result['addressLines'])
         entity.subtitle = entity.address
     
     if 'lat' in result and 'lng' in result:
         entity.lat = float(result['lat'])
         entity.lng = float(result['lng'])
     
     if 'region' in result:
         entity.vicinity = result['region']
     
     if 'phoneNumbers' in result:
         phoneNumbers = result['phoneNumbers']
         
         if len(phoneNumbers) > 0:
             entity.phone = phoneNumbers[0]['number']
     
     entity.googleLocal = {}
     # keep a lowercased copy of the title alongside the original
     entity.titlel = entity.title.lower()
     
     return entity
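
Note: the field names in Example #3 (titleNoFormatting, addressLines, lat, lng, region, phoneNumbers) are read from a Google Local-style result dict. A made-up payload in that shape, showing the same extraction without the Entity wrapper (all values are illustrative):

    result = {
        'titleNoFormatting' : "Joe's Diner",
        'addressLines'      : ['123 Main St', 'Seattle, WA 98101'],
        'lat'               : '47.6097',
        'lng'               : '-122.3331',
        'region'            : 'WA',
        'phoneNumbers'      : [{'number' : '(206) 555-0100'}],
    }

    address  = ', '.join(result['addressLines'])          # '123 Main St, Seattle, WA 98101'
    lat, lng = float(result['lat']), float(result['lng'])
    phone    = result['phoneNumbers'][0]['number'] if result['phoneNumbers'] else None
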
Example #4
    def _parseResultsPage(self, pool, href):
        try:
            soup = utils.getSoup(href)
        except urllib2.HTTPError:
            utils.log("[%s] error parsing page %s" % (self, href))
            return
        
        results = soup.find('div', { 'name' : 'LocationDirectory' }).findAll('h3')
        
        for result in results:
            
            try:
                name = result.find('a').getText().strip()
            except Exception:
                continue

            try:
                raw_address = result.findNext('span', { 'class' : 'address'}).getText()
                street = raw_address[0:raw_address.find('(')].strip()
                locale = raw_address[raw_address.find(')')+1:raw_address.find('CA')+2].strip()
                addr = '{0}, {1}'.format(street, locale)
            except Exception:
                utils.log("[%s] error parsing address for %s (%s)" % (self, name, href))
                continue
            
            if addr == '':
                continue
                
            if name == '':
                continue 
            
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "bar"
            entity.title   = name
            entity.address = addr
            entity.sources.sfweekly = { }
            
            self._output.put(entity)
        
        # try the next page
        try: 
            pagination = soup.find('span', { 'class' : 'Pagination' }).getText()
            if 'Next' in pagination:
                pagination = soup.find('span', { 'class' : 'Pagination' })
                href_get = pagination.find('span', { 'class' : 'PaginationSelected' }).findNext('a').get('href')
                next_page = '{0}{1}'.format('http://www.sfweekly.com', href_get)
            else: 
                next_page = '' 
        except Exception: 
            next_page = ''
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)
Example #5
    def _parseResultsPage(self, pool, href):
        try:
            soup = utils.getSoup(href)
        except urllib2.HTTPError:
            utils.log("[%s] error parsing page %s" % (self, href))
            return
        
        results = soup.find('div', { 'class' : 'split-right-column' }).findAll('div', { 'class' : 'clear' })
        
        for result in results:
            
            try:
                name = result.findNext('div').find('h2').find('a').getText().strip()
            except Exception:
                continue

            try:
                street = result.findNext('div').find('address').getText()
                locale = '{0}, {1}'.format('Los Angeles', 'CA')
                addr = '{0}, {1}'.format(street, locale)
            except Exception:
                continue
            
            if addr == '':
                continue
                
            if name == '':
                continue 
            
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            if 'Bars' in result.findNext('span').getText():
                entity.subcategory = "bar"
            else:
                entity.subcategory = "restaurant"
            
            entity.title   = name
            entity.address = addr
            entity.sources.timeout_la = { }
            
            self._output.put(entity)
        
        # try the next page
        try: 
            href_get = soup.find('div', { 'class' : 'next' }).find('a').get('href')
            next_page = '{0}{1}'.format('http://www.timeout.com', href_get)
        except Exception: 
            next_page = ''
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)
Example #6
 def _parseResultsPage(self, pool, href):
     try:
         soup = utils.getSoup(href)
     except urllib2.HTTPError:
         utils.log("[%s] error parsing page %s" % (self, href))
         return
     
     results = soup.findAll('h3')
     for result in results:
         try:
             name = result.find('span', { 'style' : 'cursor:pointer;' }).getText().strip()
         except AttributeError:
             utils.log("[%s] error parsing name (%s)" % (self, href))
             return
         
         try:
             address1 = result.findNext('span', { 'class' : 'addresslinecaps' }).getText().strip()
             if '(' in address1:
                 # sf mag does not provide any city, state or zip information, 
                 # so inserting basic universal info manually.
                 addr = '{0}, {1}'.format(address1.split('(')[0].strip(), 'San Francisco, CA')
             else: 
                 addr = '{0}, {1}'.format(address1, 'San Francisco, CA') 
         except AttributeError:
             utils.log("[%s] error parsing address for %s (%s)" % (self, name, href))
             return
         
         entity = Entity()
         entity.subcategory = "restaurant"
         entity.title   = name
         entity.address = addr
         entity.sources.sfmag = { }
         
         self._output.put(entity)
     
     #locate total pages and compare against current page num to determine if we should iterate again
     try:
         total_pages = soup.find('span', { 'class' : 'last' }).findPrevious('span').getText().strip()
     except AttributeError:
         # crawling of pages is done
         return
     
     index = href.find('&page=')
     end = href.find('&keyword')
     page = href[index+6:end]
     
     if int(page) <= int(total_pages)-1:
         next_page = href.replace('&page=' + str(page), '&page=' + str(int(page)+1))
         pool.spawn(self._parseResultsPage, pool, next_page)
     else:
         return
     
     time.sleep(0.01)
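
Note: Example #6 pulls the current page number out of the URL by slicing between '&page=' and '&keyword', which only works while the query parameters keep that exact order. A more defensive version using only the Python 2 stdlib (the helper name is ours):

    import urlparse, urllib

    def bump_page(href):
        parts = urlparse.urlparse(href)
        query = urlparse.parse_qs(parts.query)
        page  = int(query.get('page', ['1'])[0])
        query['page'] = [str(page + 1)]
        return urlparse.urlunparse(parts._replace(query=urllib.urlencode(query, doseq=True)))

    # '...?city=sf&page=2&keyword=x' -> same URL with page=3 (parameter order may change)
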
Example #7
    def _parseResultsPage(self, pool, href):
        try:
            soup = utils.getSoup(href)
        except urllib2.HTTPError:
            utils.log("[%s] error parsing page %s" % (self, href))
            return
        
        results = soup.find('div', { 'class' : 'search_results' }).findAll('div', { 'class' : 'restaurant'})
        
        for result in results:
            
            try:
                name = result.find('h3').find('a').getText().strip()
            except Exception:
                continue

            try:
                street = result.find('br').previousSibling.strip()
                locale = '{0}, {1}'.format(result.find('br').nextSibling.strip(), 'CA')
                addr = '{0}, {1}'.format(street, locale)
            except Exception:
                utils.log("[%s] error parsing address for %s (%s)" % (self, name, href))
                continue
            
            if addr == '':
                continue
                
            if name == '':
                continue 
            
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title   = name
            entity.address = addr
            entity.sources.sfgate = { }
            
            self._output.put(entity)
        
        # try the next page
        try: 
            href_get = soup.find('li', { 'class' : 'next' }).find('a').get('href')
            next_page = '{0}{1}'.format('http://www.sfgate.com', href_get)
        except Exception: 
            next_page = ''
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)
Example #8
 def _parseRestaurantPage(self, pool, region_name, city_name, restaurant_name, href):
     utils.log("[%s] parsing restaurant '%s.%s.%s' (%s)" % (self, region_name, city_name, restaurant_name, href))
     
     try:
         soup = utils.getSoup(href)
     except Exception:
         utils.printException()
         utils.log("[%s] error downloading page %s" % (self, href))
         return
     
     # parse the address for the current restaurant
     addr   = soup.find('div', {'class' : 'address'})
     street = addr.find('span', {'class' : 'street'}).getText().strip()
     geo    = addr.find('span', {'class' : 'geo'}).getText().strip()
     
     address = "%s, %s" % (street, geo)
     
     # add the current restaurant to the output for this crawler
     entity = Entity()
     entity.subcategory = "restaurant"
     entity.title   = restaurant_name
     entity.address = address
     entity.sources.zagat = {
         'zurl' : self.base + href, 
     }
     
     # parse cuisine
     header = soup.find('div', {'id' : "block-zagat_restaurants-14"})
     if header is not None:
         header = header.find('ul').find('li', {'class' : 'first'})
         
         if header is not None:
             entity.cuisine = header.getText()
     
     # parse website
     site = soup.find('span', {'class' : 'website'})
     if site is not None:
         site = site.find('a')
         
         if site is not None:
             entity.site = site.get('href')
     
     # parse preview image
     img = soup.find('div', {'id' : 'content'}).find('div', {'class' : 'photo'})
     if img is not None:
         img = img.find('img')
         
         if img is not None:
             entity.image = img.get('src')
     
     self._output.put(entity)
Example #9
 def _parseRestaurantPage(self, pool, queue, url, name, base=False):
     utils.log('[%s] parsing restaurant page %s (%s)' % (self, name, url))
     
     try:
         soup = utils.getSoup(url)
     except Exception:
         utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
         return
     
     content = soup.find('div', { 'id' : 'content'})
     
     if content is None:
         return
     
     entity = Entity()
     entity.title = content.find('h1').getText()
     entity.subcategory = "restaurant"
     entity.seattletimes = {}
     
     details = content.find('div', {'id' : 'edbtext'})
     desc    = details.find('p').getText()
     if desc:
         entity.desc = desc
     
     details = details.findAll('p', {'class' : 'list'})
     address = details[0].renderContents().strip().replace('<br />', '')
     address = re.sub('[ \n\t]+', ' ', address)
     entity.address = address
     
     if len(details) > 1:
         # the href lives on an <a> inside the paragraph, not on the <p> itself
         site = details[1].find('a')
         if site is not None:
             entity.site = site.get('href')
     
     if len(details) > 2:
         hoursOfOperation = details[2].getText()
         if hoursOfOperation is not None:
             entity.hoursOfOperation = hoursOfOperation
     
     key = (entity.title, entity.address)
     if key in self.seen or '(closed)' in entity.title.lower():
         return
     
     self.seen.add(key)
     self._output.put(entity)
Example #10
 def _parseDetailPage(self, name, href, subcategory):
     try:
         soup = utils.getSoup(href)
     except urllib2.HTTPError:
         utils.log("[%s] error parsing %s (%s)" % (self, name, href))
         return
     
     summ = soup.find('div', {'class' : 'summary-address'})
     
     try:
         addrp = summ.find('p', {'class' : 'adr'})
         
         street_addr = addrp.find('span', {'class' : 'street-address'}).getText().strip()
         locality    = addrp.find('span', {'class' : 'locality'}).getText().strip()
         region      = addrp.find('span', {'class' : 'region'}).getText().strip()
         
         try:
             postal_code = addrp.find('span', {'class' : 'postal-code'}).getText().strip()
         except AttributeError:
             postal_code = ""
         
         addr = "%s, %s, %s %s" % (street_addr, locality, region, postal_code)
     except AttributeError:
         try:
             p = summ.find('p').getText()
             r = re.compile('(.*)nr\. ', re.DOTALL)
             m = r.match(p)
             
             if m is None:
                 r = re.compile('(.*)at[. ]', re.DOTALL)
                 m = r.match(p)
             
             addr = m.groups()[0].replace('\n', ' ').strip()
         except AttributeError:
             utils.log("[%s] error parsing %s (%s)" % (self, name, href))
             return
     
     entity = Entity()
     entity.subcategory = subcategory
     entity.title   = name
     entity.address = addr
     entity.nymag = { }
     
     self._output.put(entity)
Example #11
    def _parseEntity(self, row, count):
        # utils.log("[%s] parsing entity %d" % (self, count))

        name = row["name"].lower().strip()
        collapsed = False

        if name in to_collapse:
            if to_collapse[name]:
                self.numCollapsed += 1
                return

            to_collapse[name] = True
            collapsed = True

        entity = Entity()
        entity.subcategory = "restaurant"
        entity.factual = {"table": "US_Restaurants_V2.csv"}

        if not collapsed:
            address = FactualUtils.parseAddress(row)
            if address is not None:
                entity.address = address

        for srcKey, destKey in self._map.iteritems():
            if srcKey in row and row[srcKey]:
                entity[destKey] = row[srcKey]

        self._output.put(entity)
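
Note: Example #11 copies whitelisted CSV columns onto the entity through self._map, which is defined elsewhere in the crawler. A hypothetical map of that shape (the Factual column names here are guesses for illustration, not the original mapping):

    _map = {
        'name'      : 'title',
        'tel'       : 'phone',
        'website'   : 'site',
        'latitude'  : 'lat',
        'longitude' : 'lng',
    }

    row    = {'name' : "Joe's Diner", 'tel' : '(206) 555-0100', 'website' : ''}
    entity = {}
    for srcKey, destKey in _map.iteritems():
        if srcKey in row and row[srcKey]:   # skip absent and empty columns
            entity[destKey] = row[srcKey]

    assert entity == {'title' : "Joe's Diner", 'phone' : '(206) 555-0100'}
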
Example #12
 def _parseResultsPage(self, pool, href):
     try:
         soup = utils.getSoup(href)
     except urllib2.HTTPError:
         utils.log("[%s] error parsing page %s" % (self, href))
         return
     
     results = soup.find('div', { 'id' : 'searchResults' }).findAll('td', { 'class' : 'start' })
     
     for result in results:
         
         try:
             name = result.find('a').getText().strip()
         except AttributeError:
             utils.log("[%s] error parsing name (%s)" % (self, href))
             return
             
         # count the <br> tags to tell which address layout this row uses
         x = len(result.findAll('br'))
         
         if x == 3: 
             try:
                 addr = '{0}, {1}'.format(result.find('a').nextSibling.strip(), result.find('br').nextSibling.strip())
             except Exception:
                 utils.log("[%s] error parsing address for %s (%s)" % (self, name, href))
                 return
                 
         elif x == 4:
             try:
                 addr = '{0}, {1}'.format(result.contents[3].strip(), result.contents[5].strip())
             except Exception:
                 utils.log("[%s] error parsing address for %s (%s)" % (self, name, href))
                 return 
                 
         else: 
             addr = ''
         
         if addr == '':
             continue 
         
         if 'CLOSED' in name:
             continue
         
         if (name, addr) in self._seen:
             continue
         
         self._seen.add((name, addr))
         
         entity = Entity()
         entity.subcategory = "restaurant"
         entity.title   = name
         entity.address = addr
         entity.sources.bostonmag = { }
         
         self._output.put(entity)
     
     # try the next page
     next_page_ending = soup.find('div', { 'class' : 'right_align' }).findAll('a')
     next_page = ''
     
     for n in next_page_ending: 
         if 'Next' in str(n):
             next_page = href.replace(href[href.find('?'):], n.get('href'))
     
     if next_page != '':
         pool.spawn(self._parseResultsPage, pool, next_page)
Example #13
 def _parseResultsPage(self, pool, href):
     try:
         soup = utils.getSoup(href)
     except urllib2.HTTPError:
         utils.log("[%s] error parsing page %s" % (self, href))
         return
     
     results = soup.find('ul', { 'id' : 'search_pagination' }).findAll('div', { 'class' : 'listing_item' })
     
     for result in results:
         try:
             name = result.find('h2').getText().strip()
         except AttributeError:
             utils.log("[%s] error parsing name (%s)" % (self, href))
             return
         
         try:
             addr = result.find('span', { 'class' : 'address' }).getText().strip()
         except AttributeError:
             utils.log("[%s] error parsing address for %s (%s)" % (self, name, href))
             return
         
         if addr == '':
             continue 
             
         if 'CLOSED' in name:
             continue
         
         if addr in self._seen:
             continue
             
         self._seen.add(addr)
         
         # allow at most three entries that share the same name
         if name in self._count:
             if self._count[name] < 3:
                 self._count[name] += 1
             else:
                 continue
         else:
             self._count[name] = 1
     
         entity = Entity()
         entity.subcategory = "restaurant"
         entity.title   = name
         entity.address = addr
         entity.sources.latimes = { }
         
         self._output.put(entity)
     
     # try the next page
     
     try:
         next_page = soup.find('a', {'class': 'next_page'}).get("href")
         if next_page != '':
             next_page_url = "{0}{1}".format('http://findlocal.latimes.com', next_page)
             pool.spawn(self._parseResultsPage, pool, next_page_url)
     except AttributeError:
         # crawling of pages is done
         #utils.log("Done crawling: %s" % href)
         pass
Example #14
 def _parseResultsPage(self, pool, url, offset=0, base=False):
     utils.log('[%s] parsing page %s' % (self, url))
     max_offset = 8
     
     if offset < max_offset:
         # optimistically process the next results page before processing this one
         if 'start=' in url:
             start = self.start_re.match(url).groups()[0]
             nexti = int(start) + self.results_per_page
             url2  = url.replace('start=%s' % start, 'start=%d' % nexti)
         else:
             url2  = "%s&start=%d" % (url, self.results_per_page)
         
         pool.spawn(self._parseResultsPage, pool, url2, offset + 1)
     
     try:
         soup = utils.getSoup(url)
     except Exception:
         utils.printException()
         utils.log("[%s] error downloading page %s" % (self, url))
         return
     
     if offset >= max_offset:
         next_pagel = soup.find('a', {'id' : 'pager_page_next'})
         
         if next_pagel is not None:
             href = self.base + next_pagel.get('href')
             pool.spawn(self._parseResultsPage, pool, href, 0)
             time.sleep(0.01)
     
     if base:
         categories = soup.findAll('a', {'id' : self.category_re})
         
         if categories:
             for category in categories:
                 href = self.base + category.get('href')
                 pool.spawn(self._parseResultsPage, pool, href, 0)
             
             # yield so other threads have a chance to start working
             time.sleep(0.01)
     
     separator = '___'
     results   = soup.findAll('div', {'class' : re.compile('businessresult')})
     
     # findAll returns an empty list (not None) when nothing matches
     if not results:
         return
     
     for result in results:
         entity = Entity()
         entity.subcategory = 'restaurant'
         entity.sources.yelp = { }
         
         titlel = result.find('a')
         title  = titlel.getText()
         entity.title = self.title_re.match(title).groups()[0]
         entity.yurl  = self.base + titlel.get('href')
         
         addr   = result.find('address').getText(separator)
         match  = self.address_re.match(addr).groups()
         
         entity.address = "%s, %s" % (match[0], match[1])
         entity.phone = match[2]
         
         rating = result.find('img')
         if rating is not None:
             entity.yrating = float(self.rating_reviews_re.match(rating.get('title')).groups()[0])
         
         reviews = result.find('span', {'class' : 'reviews'})
         if reviews is not None:
             entity.yreviews = int(self.rating_reviews_re.match(reviews.getText()).groups()[0])
         
         key = (entity.title, entity.address)
         if key not in self.seen:
             self.seen.add(key)
             self._output.put(entity)
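
Note: Example #14 depends on several pre-compiled patterns (self.start_re, self.title_re, self.address_re, self.rating_reviews_re) defined elsewhere in the crawler. Plausible reconstructions consistent with how they are used above (these are our guesses, not the original definitions):

    import re

    start_re          = re.compile(r'.*start=(\d+).*')       # page offset from the URL
    title_re          = re.compile(r'\d+\.\s*(.*)')          # strips the "1. " result prefix
    address_re        = re.compile(r'(.*?)___(.*?)___(.*)')  # getText joined with the '___' separator
    rating_reviews_re = re.compile(r'.*?([\d.]+)')           # first number in "4.5 star rating" / "27 reviews"

    assert start_re.match('http://www.yelp.com/search?find_loc=sf&start=40').groups()[0] == '40'
    assert title_re.match('3. Some Restaurant').groups()[0] == 'Some Restaurant'
    assert address_re.match('123 Main St___Seattle, WA___(206) 555-0100').groups()[2] == '(206) 555-0100'
    assert rating_reviews_re.match('27 reviews').groups()[0] == '27'
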
Example #15
    def _parseResultsPage(self, pool, href):
        try:
            soup = utils.getSoup(href)
        except urllib2.HTTPError:
            utils.log("[%s] error parsing page %s" % (self, href))
            return
        
        results = soup.find('div', { 'class' : 'searchresults' }).findAll('div', { 'class' : 'fs1-sans' })
        
        for result in results:
            
            # skip the metadata rows (price, kid-friendly, ratings, etc.) that share
            # the same fs1-sans class as the actual listings
            text = result.getText()
            if any(word in text for word in ('Price', 'Kid', 'Other', 'Wheelchair', 'Cuisines', 'Rating', 'Latest')):
                continue
            
            try:
                name = result.find('strong').getText().strip()
            except Exception:
                continue

            try:
                # the address is split across four consecutive <span> tags
                span  = result.find('span')
                parts = [span.getText()]
                for _ in xrange(3):
                    span = span.findNext('span')
                    parts.append(span.getText())
                addr = '{0} {1}, {2}, {3}'.format(*parts)
            except Exception:
                utils.log("[%s] error parsing address for %s (%s)" % (self, name, href))
                continue
                        
            if addr == '':
                continue
                
            if name == '':
                continue 
            
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title   = name
            entity.address = addr
            entity.sources.washmag = { }
            
            self._output.put(entity)
        
        return 
Example #16
    def _parseResultsPage(self, pool, href):
        try:
            soup = utils.getSoup(href)
        except urllib2.HTTPError:
            utils.log("[%s] error parsing page %s" % (self, href))
            return
        
        results = soup.find('td', { 'id' : 'search-results' }).findAll('tr')
        
        for result in results:
            
            try:
                name = result.find('td', { 'class' : 'business-name' }).find('a').getText().strip()
            except Exception:
                continue

            try:
                # street and locality sit on either side of the <br> in the contact cell
                br     = result.find('td', { 'class' : 'contact' }).find('br')
                street = br.previousSibling.strip()
                locale = br.nextSibling.strip()
                addr   = '{0}, {1}'.format(street, locale)
            except Exception:
                utils.log("[%s] error parsing address for %s (%s)" % (self, name, href))
                continue
            
            if 'OPENING SOON' in result.find('td', { 'class' : 'categories' }).getText(): 
                continue
            
            if addr == '':
                continue
                
            if name == '':
                continue 
            
            if 'CLOSED' in name:
                continue
            
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title   = name
            entity.address = addr
            entity.sources.chicagomag = { }
            
            self._output.put(entity)
        
        # try the next page
        next_page_all= soup.find('div', { 'id' : 'pager' }).findAll('a')
        next_page = ''
        
        for n in next_page_all: 
            if 'Next' in n.getText():
                next_page = n.get('href')
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)