Example #1
 def _parseRestaurantPage(self, pool, region_name, area_name, restaurant_name, href):
     utils.log("[%s] parsing restaurant '%s.%s.%s' (%s)" % (self, region_name, area_name, restaurant_name, href))
     
     try:
         soup = utils.getSoup(href)
     except:
         utils.log("[%s] error downloading page %s" % (self, href))
         return
     
     # parse the address for the current restaurant
     addr     = soup.find('span', {'class' : 'adr'})
     street   = addr.find('span', {'class' : 'street-address'}).getText().strip()
     locality = addr.find('span', {'class' : 'locality'}).getText().strip()
     region   = addr.find('span', {'class' : 'region'}).getText().strip()
     zipcode  = addr.find('a', {'class' : re.compile('postal-code')}).getText().strip()
     
     address = "%s, %s, %s %s" % (street, locality, region, zipcode)
     
     # add the current restaurant to the output for this crawler
     entity = Entity()
     entity.subcategory = "restaurant"
     entity.title   = restaurant_name
     entity.address = address
     entity.sources.urbanspoon = {
         'uurl' : href, 
     }
     
     self._output.put(entity)
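These examples all assume an Entity value object that supports both attribute-style and dict-style access, plus a nested sources group (entity.sources.urbanspoon above). A minimal sketch of such a container, as an illustrative stand-in rather than the project's actual class:

 class AttributeDict(dict):
     # illustrative stand-in: a dict subclass that exposes keys as attributes,
     # so entity.title and entity['title'] are interchangeable
     def __getattr__(self, key):
         try:
             return self[key]
         except KeyError:
             raise AttributeError(key)
     
     def __setattr__(self, key, value):
         self[key] = value
 
 class Entity(AttributeDict):
     def __init__(self):
         super(Entity, self).__init__()
         self['sources'] = AttributeDict()   # e.g. entity.sources.urbanspoon = { 'uurl' : href }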
Example #2
 def _parseRow(self, row):
     retain_result = self._filter(row)
     
     if not retain_result:
         self.numFiltered += 1
         return
     
     entity = Entity()
     entity.subcategory = self.subcategories[0]
     
     if isinstance(retain_result, dict):
         for col, value in retain_result.iteritems():
             if value is not None:
                 entity[col] = value
     
     for k in row:
         if k not in self._columnMap:
             continue
         
         k2 = self._columnMap[k]
         if k2 is None:
             continue
         
          entity[k2] = row[k]
     
     utils.log(entity.title)
     self._output.put(entity)
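For reference, a hypothetical _filter and _columnMap that would satisfy _parseRow above (both names come from the example; the concrete values are illustrative). _filter returns a falsy value to drop a row, True to keep it as-is, or a dict of pre-resolved fields to merge in first; _columnMap renames source columns, with None marking columns to ignore:

 # hypothetical wiring for _parseRow (values illustrative)
 def keep_named_rows(row):
     # keep the row and pre-resolve its title from the 'name' column
     if row.get('name'):
         return { 'title' : row['name'].strip() }
     return None
 
 column_map = {
     'name'        : 'title',     # source column -> entity field
     'addr'        : 'address',
     'internal_id' : None,        # mapped to None: explicitly ignored
 }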
Example #3
 def _parseEntity(self, sheet, index, numEntities):
     if numEntities > 100 and ((index - 1) % (numEntities / 100)) == 0:
         utils.log("[%s] done parsing %s" % \
             (self.NAME, utils.getStatusStr(index - 1 - Globals.options.offset, numEntities)))
         time.sleep(0.1)
     
     row = sheet.row_values(index)
     
     entity = Entity()
     entity.subcategory = "restaurant"
     entity.title = row[1]
     entity.address = row[3] + ', ' + \
                      row[4] + ', ' + \
                      row[5] + ' ' + \
                      row[6]
     
     entity.openTable = {
         'rid' : int(row[8]), 
         'reserveURL' : row[9], 
         'countryID' : row[10], 
         'metroName' : row[0], 
         'neighborhoodName' : row[2], 
     }
     
     # don't make external calls to opentable in test mode
     if not Globals.options.test:
         result = OpenTableParser.parseEntity(entity)
         if result is None:
             return
     
     #print entity.title
     #from pprint import pprint
     #pprint(entity.getDataAsDict())
     self._output.put(entity)
Example #4
    def _parseResultsPage(self, pool, href):
        try:
            soup = utils.getSoup(href)
        except urllib2.HTTPError:
            utils.log("[%s] error parsing page %s" % (self, href))
            return
        
        results = soup.find('div', { 'name' : 'LocationDirectory' }).findAll('h3')
        
        for result in results:
            
            try:
                name = result.find('a').getText().strip()
            except Exception:
                continue

            try:
                raw_address = result.findNext('span', { 'class' : 'address'}).getText()
                street = raw_address[0:raw_address.find('(')].strip()
                locale = raw_address[raw_address.find(')')+1:raw_address.find('CA')+2].strip()
                addr = '{0}, {1}'.format(street, locale)
            except Exception:
                utils.log("[%s] error parsing address (%s)" % (self, href))
                continue
            
            if addr == '':
                continue
                
            if name == '':
                continue 
            
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "bar"
            entity.title   = name
            entity.address = addr
            entity.sources.sfweekly = { }
            
            self._output.put(entity)
        
        # try the next page
        try:
            pagination = soup.find('span', { 'class' : 'Pagination' })
            if 'Next' in pagination.getText():
                href_get = pagination.find('span', { 'class' : 'PaginationSelected' }).findNext('a').get('href')
                next_page = '{0}{1}'.format('http://www.sfweekly.com', href_get)
            else:
                next_page = ''
        except Exception:
            next_page = ''
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)
Example #5
    def _parseResultsPage(self, pool, href):
        try:
            soup = utils.getSoup(href)
        except urllib2.HTTPError:
            utils.log("[%s] error parsing page %s" % (self, href))
            return
        
        results = soup.find('div', { 'class' : 'split-right-column' }).findAll('div', { 'class' : 'clear' })
        
        for result in results:
            
            try:
                name = result.findNext('div').find('h2').find('a').getText().strip()
            except Exception:
                continue

            try:
                street = result.findNext('div').find('address').getText()
                locale = '{0}, {1}'.format('Los Angeles', 'CA')
                addr = '{0}, {1}'.format(street, locale)
            except Exception:
                continue
            
            if addr == '':
                continue
                
            if name == '':
                continue 
            
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            if 'Bars' in result.findNext('span').getText():
                entity.subcategory = "bar"
            else:
                entity.subcategory = "restaurant"
            
            entity.title   = name
            entity.address = addr
            entity.sources.timeout_la = { }
            
            self._output.put(entity)
        
        # try the next page
        try: 
            href_get = soup.find('div', { 'class' : 'next' }).find('a').get('href')
            next_page = '{0}{1}'.format('http://www.timeout.com', href_get)
        except Exception: 
            next_page = ''
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)
Example #6
 def _parseResultsPage(self, pool, href):
     try:
         soup = utils.getSoup(href)
     except urllib2.HTTPError:
         utils.log("[%s] error parsing page %s" % (self, href))
         return
     
     results = soup.findAll('h3')
     for result in results:
         try:
             name = result.find('span', { 'style' : 'cursor:pointer;' }).getText().strip()
         except AttributeError:
             utils.log("[%s] error parsing name (%s)" % (self, href))
             return
         
         try:
             address1 = result.findNext('span', { 'class' : 'addresslinecaps' }).getText().strip()
             if '(' in address1:
                 # sf mag does not provide any city, state or zip information, 
                 # so inserting basic universal info manually.
                 addr = '{0}, {1}'.format(address1.split('(')[0].strip(), 'San Francisco, CA')
             else: 
                 addr = '{0}, {1}'.format(address1, 'San Francisco, CA') 
         except AttributeError:
             utils.log("[%s] error parsing address for '%s' (%s)" % (self, name, href))
             return
         
         entity = Entity()
         entity.subcategory = "restaurant"
         entity.title   = name
         entity.address = addr
         entity.sources.sfmag = { }
         
         self._output.put(entity)
     
     #locate total pages and compare against current page num to determine if we should iterate again
     try:
         total_pages = soup.find('span', { 'class' : 'last' }).findPrevious('span').getText().strip()
     except AttributeError:
         # crawling of pages is done
         return
     
     index = href.find('&page=')
     end = href.find('&keyword')
     page = href[index+6:end]
     
     if int(page) <= int(total_pages) - 1:
         next_page = href.replace('&page=' + page, '&page=' + str(int(page) + 1))
         pool.spawn(self._parseResultsPage, pool, next_page)
         time.sleep(0.01)
Example #7
    def _parseResultsPage(self, pool, href):
        try:
            soup = utils.getSoup(href)
        except urllib2.HTTPError:
            utils.log("[%s] error parsing page %s" % (self, href))
            return
        
        results = soup.find('div', { 'class' : 'search_results' }).findAll('div', { 'class' : 'restaurant'})
        
        for result in results:
            
            try:
                name = result.find('h3').find('a').getText().strip()
            except Exception:
                continue

            try:
                street = result.find('br').previousSibling.strip()
                locale = '{0}, {1}'.format(result.find('br').nextSibling.strip(), 'CA')
                addr = '{0}, {1}'.format(street, locale)
            except Exception:
                utils.log("[%s] error parsing address (%s)" % (self, href))
                continue
            
            if addr == '':
                continue
                
            if name == '':
                continue 
            
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title   = name
            entity.address = addr
            entity.sources.sfgate = { }
            
            self._output.put(entity)
        
        # try the next page
        try: 
            href_get = soup.find('li', { 'class' : 'next' }).find('a').get('href')
            next_page = '{0}{1}'.format('http://www.sfgate.com', href_get)
        except Exception: 
            next_page = ''
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)
Example #8
 def _parseEntity(self, row, count):
     #utils.log("[%s] parsing entity %d" % (self.NAME, count))
     
     entity = Entity()
     entity.subcategory = "app"
     
     entity.factual = {
         'table' : 'iPhone_Apps.csv'
     }
     
     for srcKey, destKey in self._map.iteritems():
         if srcKey in row and row[srcKey]:
             entity[destKey] = row[srcKey]
     
     self._output.put(entity)
Example #9
def main():
    options, args = parseCommandLine()
    
    api = MongoStampedAPI()
    matcher = EntityMatcher(api, options)
    
    keep = Entity()
    keep.entity_id = args[-1]
    
    remove = []
    for arg in args[:-1]:
        entity = Entity()
        entity.entity_id = arg
        remove.append(entity)
    
    matcher.resolveDuplicates(keep, remove, override=options.force)
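Judging from the argument handling, the duplicate entity ids come first on the command line and the id to keep comes last (the script name here is hypothetical):

 # python resolve_duplicates.py <dup_id_1> <dup_id_2> ... <keep_id>
 # every id except the last is merged into, and removed in favor of, the final one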
Example #10
 def _parseResultsPage(self, queue, url, name, depth):
     
     try:
         soup = utils.getSoup(url)
     except:
         utils.printException()
         utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
         return
     
     if depth < self.max_depth:
         # extract and parse subcategory pages
         category_ul = soup.find('ul', {'id' : 'zg_browseRoot'})
         
         if category_ul is not None:
             while True:
                 temp_ul = category_ul.find('ul')
                 if temp_ul is None:
                     break
                 else:
                     category_ul = temp_ul
             
             categories = category_ul.findAll('a')
             
             for category in categories:
                 href = category.get('href')
                 name = utils.normalize(category.getText())
                 
                 queue.put_nowait((href, name, depth + 1))
     
     self._globals['books'] = soup
     
     rss_link = soup.find('div', {'id' : 'zg_rssLinks'})
     if rss_link is None:
         return
     
     rss_link = rss_link.findAll('a')[1].get('href')
     if rss_link in self.seen:
         return
     
     self.seen.add(rss_link)
     
     entity = Entity()
     entity.title = rss_link
     entity.subcategory = 'book'
     
     self._output.put(entity)
Example #11
 def _parseDetailPage(self, name, href, subcategory):
     try:
         soup = utils.getSoup(href)
     except urllib2.HTTPError:
         utils.log("[%s] error parsing %s (%s)" % (self, name, href))
         return
     
     summ = soup.find('div', {'class' : 'summary-address'})
     
     try:
         addrp = summ.find('p', {'class' : 'adr'})
         
         street_addr = addrp.find('span', {'class' : 'street-address'}).getText().strip()
         locality    = addrp.find('span', {'class' : 'locality'}).getText().strip()
         region      = addrp.find('span', {'class' : 'region'}).getText().strip()
         
         try:
             postal_code = addrp.find('span', {'class' : 'postal-code'}).getText().strip()
         except AttributeError:
             postal_code = ""
         
         addr = "%s, %s, %s %s" % (street_addr, locality, region, postal_code)
     except AttributeError:
         try:
             p = summ.find('p').getText()
             r = re.compile('(.*)nr\. ', re.DOTALL)
             m = r.match(p)
             
             if m is None:
                 r = re.compile('(.*)at[. ]', re.DOTALL)
                 m = r.match(p)
             
             addr = m.groups()[0].replace('\n', ' ').strip()
         except AttributeError:
             utils.log("[%s] error parsing %s (%s)" % (self, name, href))
             return
     
     entity = Entity()
     entity.subcategory = subcategory
     entity.title   = name
     entity.address = addr
     entity.nymag = { }
     
     self._output.put(entity)
Example #12
 def _parseResultsPage(self, pool, queue, url, name, base=False):
     utils.log('[%s] parsing page %s (%s)' % (self, name, url))
     
     try:
         html = utils.getFile(url)
         html = html.replace("header>", "div>") 
         soup = BeautifulSoup(html)
     except:
         #utils.printException()
         utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
         return
     
     # extract and parse the rest of the paginated results
     if base:
         page = soup.find('nav').find('span').getText()
         num_pages = int(self.page_re.match(page).groups()[0])
         
         for i in xrange(2, num_pages + 1):
             href = '%s&pg=%d' % (url, i)
             
             queue.put_nowait((href, name))
     
     results = soup.findAll('section', {'class' : 'CWListing'})
     
     for result in results:
         entity = Entity()
         entity.subcategory = "book"
         entity.awardAnnals = {}
         
         entity.title  = result.find('h4').find('a').getText().strip()
         entity.author = result.find('p', {'class' : 'creators'}).getText()
         
         key = (entity.title, entity.author)
         if key in self.seen:
             continue
         
         self.seen.add(key)
         self._output.put(entity)
Example #13
 def _parseRestaurantPage(self, pool, region_name, city_name, restaurant_name, href):
     utils.log("[%s] parsing restaurant '%s.%s.%s' (%s)" % (self, region_name, city_name, restaurant_name, href))
     
     try:
         soup = utils.getSoup(href)
     except:
         utils.printException()
         utils.log("[%s] error downloading page %s" % (self, href))
         return
     
     # parse the address for the current restaurant
     addr   = soup.find('div', {'class' : 'address'})
     street = addr.find('span', {'class' : 'street'}).getText().strip()
     geo    = addr.find('span', {'class' : 'geo'}).getText().strip()
     
     address = "%s, %s" % (street, geo)
     
     # add the current restaurant to the output for this crawler
     entity = Entity()
     entity.subcategory = "restaurant"
     entity.title   = restaurant_name
     entity.address = address
     entity.sources.zagat = {
         'zurl' : self.base + href, 
     }
     
     #self._globals['soup'] = soup
     # parse cuisine
     header = soup.find('div', {'id' : "block-zagat_restaurants-14"})
     if header is not None:
         header = header.find('ul').find('li', {'class' : 'first'})
         
         if header is not None:
             entity.cuisine = header.getText()
     
     # parse website
     site = soup.find('span', {'class' : 'website'})
     if site is not None:
         site = site.find('a')
         
         if site is not None:
             entity.site = site.get('href')
     
     # parse preview image
     img = soup.find('div', {'id' : 'content'}).find('div', {'class' : 'photo'})
     if img is not None:
         img = img.find('img')
         
         if img is not None:
             entity.image = img.get('src')
     
     self._output.put(entity)
Example #14
    def _parseEntity(self, row, count):
        # utils.log("[%s] parsing entity %d" % (self, count))

        name = row["name"].lower().strip()
        collapsed = False

        if name in to_collapse:
            if to_collapse[name]:
                self.numCollapsed += 1
                return

            to_collapse[name] = True
            collapsed = True

        # record how many times we've encountered each restaurant
        # if not hasattr(self, 'seen'):
        #    self.seen = {}
        # if name in self.seen:
        #    self.seen[name] += 1
        # else:
        #    self.seen[name] = 1

        entity = Entity()
        entity.subcategory = "restaurant"
        entity.factual = {"table": "US_Restaurants_V2.csv"}

        if not collapsed:
            address = FactualUtils.parseAddress(row)
            if address is not None:
                entity.address = address

        for srcKey, destKey in self._map.iteritems():
            if srcKey in row and row[srcKey]:
                entity[destKey] = row[srcKey]

        self._output.put(entity)
Example #15
 def _parseRestaurantPage(self, pool, queue, url, name, base=False):
     utils.log('[%s] parsing restaurant page %s (%s)' % (self, name, url))
     
     try:
         soup = utils.getSoup(url)
     except:
         #utils.printException()
         utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
         return
     
     content = soup.find('div', { 'id' : 'content'})
     
     if content is None:
         return
     
     entity = Entity()
     entity.title = content.find('h1').getText()
     entity.subcategory = "restaurant"
     entity.seattletimes = {}
     
     details = content.find('div', {'id' : 'edbtext'})
     desc    = details.find('p').getText()
     if desc is not None:
         entity.desc = desc
     
     details = details.findAll('p', {'class' : 'list'})
     address = details[0].renderContents().strip().replace('<br />', '')
     address = re.sub('[ \n\t]+', ' ', address)
     entity.address = address
     
     if len(details) > 1:
         site = details[1].get('href')
         if site is not None:
             entity.site = site
     
     if len(details) > 2:
         hoursOfOperation = details[2].getText()
         if hoursOfOperation is not None:
             entity.hoursOfOperation = hoursOfOperation
     
     key = (entity.title, entity.address)
     if key in self.seen or '(closed)' in entity.title.lower():
         return
     
     self.seen.add(key)
     self._output.put(entity)
Example #16
 def _parseListPage(self, pool, queue, url, name, base=False):
     utils.log('[%s] parsing list page %s (%s)' % (self, name, url))
     
     try:
         soup = utils.getSoup(url)
     except:
         #utils.printException()
         utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
         return
     
     results = soup.findAll('td', {'class' : 'summary'})
     
     for result in results:
         entity = Entity()
         entity.subcategory = "book"
         entity.nytimes = {}
         
         title = result.find('span', {'class' : 'bookName'}).getText().strip().title()
         if title.endswith(','):
             title = title[0:-1]
         
         entity.title = title
         
         details = result.getText(separator='___')
         details_match = self.details_re.match(details)
         
         if details_match:
             details_match    = details_match.groups()
             entity.author    = details_match[0]
             entity.publisher = details_match[1]
             entity.desc      = details_match[2]
         
         key = (entity.title, entity.author)
         if key in self.seen:
             continue
         
         self.seen.add(key)
         self._output.put(entity)
Example #17
 def _parseResultsPage(self, pool, href):
     try:
         soup = utils.getSoup(href)
     except urllib2.HTTPError:
         utils.log("[%s] error parsing page %s" % (self, href))
         return
     
     results = soup.find('div', { 'id' : 'searchResults' }).findAll('td', { 'class' : 'start' })
     
     for result in results:
         
         try:
             name = result.find('a').getText().strip()
          except AttributeError:
              utils.log("[%s] error parsing name (%s)" % (self, href))
              return
             
         # the number of <br> tags determines the address layout
         x = len(result.findAll('br'))
         
         if x == 3: 
             try:
                 addr = '{0}, {1}'.format(result.find('a').nextSibling.strip(), result.find('br').nextSibling.strip())
             except Exception:
                 utils.log("[%s] error parsing address for '%s' (%s)" % (self, name, href))
                 return
                 
         elif x == 4:
             try:
                 addr = '{0}, {1}'.format(result.contents[3].strip(), result.contents[5].strip())
             except Exception:
                 utils.log("[%s] error parsing address for '%s' (%s)" % (self, name, href))
                 return
                 
         else: 
             addr = ''
         
         if addr == '':
             continue 
         
         if 'CLOSED' in name:
             continue
         
         if (name, addr) in self._seen:
             continue
         
         self._seen.add((name, addr))
         
         entity = Entity()
         entity.subcategory = "restaurant"
         entity.title   = name
         entity.address = addr
         entity.sources.bostonmag = { }
         
         self._output.put(entity)
     
     # try the next page
     next_page_ending = soup.find('div', { 'class' : 'right_align' }).findAll('a')
     next_page = ''
     
     for n in next_page_ending:
         if 'Next' in str(n):
             next_page = href.replace(href[href.find('?'):], n.get('href'))
     
     if next_page != '':
         pool.spawn(self._parseResultsPage, pool, next_page)
Example #18
    def _parseResultsPage(self, pool, href):
        try:
            soup = utils.getSoup(href)
        except urllib2.HTTPError:
            utils.log("[%s] error parsing page %s" % (self, href))
            return
        
        results = soup.find('td', { 'id' : 'search-results' }).findAll('tr')
        
        for result in results:
            
            try:
                name = result.find('td', { 'class' : 'business-name' }).find('a').getText().strip()
            except Exception:
                continue

            try:
                contact = result.find('td', { 'class' : 'contact' }).find('br')
                addr = '{0}, {1}'.format(contact.previousSibling.strip(), contact.nextSibling.strip())
            except Exception:
                utils.log("[%s] error parsing address (%s)" % (self, href))
                continue
            
            if 'OPENING SOON' in result.find('td', { 'class' : 'categories' }).getText(): 
                continue
            
            if addr == '':
                continue
                
            if name == '':
                continue 
            
            if 'CLOSED' in name:
                continue
            
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title   = name
            entity.address = addr
            entity.sources.chicagomag = { }
            
            self._output.put(entity)
        
        # try the next page
        next_page_all= soup.find('div', { 'id' : 'pager' }).findAll('a')
        next_page = ''
        
        for n in next_page_all:
            if 'Next' in n.getText():
                next_page = n.get('href')
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page)
Example #19
 def _parseResultsPage(self, pool, href):
     try:
         soup = utils.getSoup(href)
     except urllib2.HTTPError:
         utils.log("[%s] error parsing page %s" % (self, href))
         return
     
     results = soup.find('ul', { 'id' : 'search_pagination' }).findAll('div', { 'class' : 'listing_item' })
     
     for result in results:
         try:
             name = result.find('h2').getText().strip()
          except AttributeError:
              utils.log("[%s] error parsing name (%s)" % (self, href))
              return
         
         try:
             addr = result.find('span', { 'class' : 'address' }).getText().strip()
          except AttributeError:
              utils.log("[%s] error parsing address for '%s' (%s)" % (self, name, href))
              return
         
         if addr == '':
             continue 
             
         if 'CLOSED' in name:
             continue
         
         if addr in self._seen:
             continue
             
         self._seen.add(addr)
         
          if name in self._count:
              if self._count[name] < 3:
                  self._count[name] += 1
              else:
                  continue
          else:
              self._count[name] = 1
     
         entity = Entity()
         entity.subcategory = "restaurant"
         entity.title   = name
         entity.address = addr
         entity.sources.latimes = { }
         
         self._output.put(entity)
     
     #try the next page
     
     try:
         next_page = soup.find('a', {'class': 'next_page'}).get("href")
         if next_page != '':
             next_page_url = "{0}{1}".format('http://findlocal.latimes.com', next_page)
             pool.spawn(self._parseResultsPage, pool, next_page_url)
     except AttributeError:
         # crawling of pages is done
         #utils.log("Done crawling: %s" % href)
         pass
Example #20
 def _parse_dump(self, filename):
     f = gzip.open(filename, 'rb')
     context = iter(etree.iterparse(f, events=("start", "end")))
     
     event, root = context.next()
     
     nid_re              = re.compile('.*\/([0-9]*)$')
     language_re         = re.compile('.*\/languages$')
     match_genre_re      = re.compile('.*\/genres$')
     match_ratings_re    = re.compile('.*\/mpaa_ratings$')
     
     match_genre_func    = lambda c: re.match(match_genre_re, c.get('scheme')) is not None
     match_ratings_func  = lambda c: re.match(match_ratings_re, c.get('scheme')) is not None
     match_language_func = lambda c: re.match(language_re, c.get('scheme')) is not None
     
     count = 0
     bonus_materials = set()
     
     # loop through each XML catalog_title element and parse it as a movie Entity
     for event, elem in context:
         if event == "end" and elem.tag == "catalog_title":
             root.clear()
             
             try:
                 rating_elem = elem.find('average_rating')
                 if rating_elem is None:
                     continue
                 
                 entity = Entity()
                 nid = elem.find('id').text
                 nid = int(re.match(nid_re, nid).groups()[0])
                 
                 bonus_materials_elem = elem.find('.//bonus_materials')
                 if bonus_materials_elem is not None:
                     links = map(lambda l: l.get('href'), bonus_materials_elem.findall('link'))
                     
                     for link in links:
                         bonus_material_id = int(re.match(nid_re, link).groups()[0])
                         #bonus_material_id = re.match(bonus_materials_id_re, link).groups()[0]
                         bonus_materials.add(bonus_material_id)
                 
                 if nid in bonus_materials:
                     continue
                 
                 title = elem.find('title').get('regular')
                 titlel = title.lower()
                 
                 if 'bonus material' in titlel:
                     continue
                 
                 entity.title = title
                 entity.nid = nid
                 entity.desc = elem.find('.//synopsis').text
                 entity.nrating = float(rating_elem.text)
                 
                 categories = elem.findall('category')
                 
                 genres = map(lambda c: c.get('label'), filter(match_genre_func, categories))
                 entity.ngenres = genres
                 
                 tv = False
                 for genre in genres:
                     if 'tv' in genre.lower():
                         tv = True
                         break
                 
                 if tv:
                     entity.subcategory = 'tv'
                 else:
                     entity.subcategory = 'movie'
                 
                 ratings = map(lambda c: c.get('label'), filter(match_ratings_func, categories))
                 if 1 == len(ratings):
                     entity.mpaa_rating = ratings[0]
                 
                 images = elem.find('.//box_art').findall('link')
                 if 3 == len(images) or 4 == len(images):
                     entity.tiny  = images[0].get('href')
                     entity.small = images[1].get('href')
                     entity.large = images[2].get('href')
                     
                     if 4 == len(images):
                         entity.hd = images[3].get('href')
                 
                 links = filter(lambda l: 'web page' == l.get('title'), elem.findall('link'))
                 if 1 == len(links):
                     entity.nurl = links[0].get('href')
                 
                 language_elem  = elem.find('.//languages_and_audio')
                 language_elems = filter(match_language_func, language_elem.findall('.//category'))
                 
                 release_year_elem = elem.find('release_year')
                 if release_year_elem is not None:
                     entity.original_release_date = release_year_elem.text
                 
                 duration = elem.find('runtime')
                 if duration is not None:
                     entity.track_length = duration.text
                 
                 languages = set()
                 for elem2 in language_elems:
                     languages.add(elem2.get('label').lower())
                 
                 if 'english' not in languages:
                     continue
                 
                 #utils.log(entity.title)
                 #pprint(entity.getDataAsDict())
                 
                 """
                 self._globals['n'] = elem
                 self._globals['s'] = etree.tostring(elem, pretty_print=True)
                 self._globals['e'] = entity
                 break
                 """
                 
                 self._output.put(entity)
                 count += 1
                 
                 # give the downstream consumer threads an occasional chance to work
                 if 0 == (count % 512):
                     time.sleep(0.1)
                 
                 elem.clear()
             except Exception, e:
                 utils.printException()
                 utils.log(elem.find('title').get('regular'))
Example #21
 def _parse_series_page(self, name, url):
     if '**' in name or 'DUPLICATE' in name or name.startswith('.hack'):
         return
     
     utils.log('[%s] parsing page %s (%s)' % (self, name, url))
     
     try:
         soup = utils.getSoup(url)
     except:
         utils.printException()
         utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
         return
     
     contents = soup.findAll('div', {'id' : 'content'})
     header = contents[0]
     
     h1 = header.find('h1') 
     title = h1.getText()
     h1.extract()
     
     entity = Entity()
     
     # parse basic show info
     entity.title = title
     entity.subcategory = 'tv'
     
     desc = header.getText().replace('\r\n', '\n')
     if len(desc) > 5:
         entity.desc = desc
     
     entity.sources.thetvdb_id = self._id_re.match(url).groups()[0]
     
     # parse images
     images = map(lambda img: img.get('src'), soup.findAll('img', {'class' : 'banner'}))
     types  = [ 'posters', 'fanart', 'graphical', ]
     
     for image_type in types:
         filtered_images = filter(lambda img: image_type in img, images)
         if len(filtered_images) > 0:
             entity.image = "%s%s" % (self.base, filtered_images[0])
             break
     
     info = contents[1].find('table').find('table')
     rows = info.findAll('tr')
     
     # parse detailed show info
     info_map = {
         0 : 'original_release_date', 
         3 : 'air_time', 
         4 : 'network_name', 
         5 : 'genre', 
     }
     
     for k, k2 in info_map.iteritems():
         try:
             value = rows[k].findAll('td')[1].getText()
             if len(value) > 0:
                 entity[k2] = value
          except:
              utils.printException()
     
     # parse cast
     try:
         actors = "%s%s" % (self.base, contents[-1].findAll('a')[-1].get('href'))
         actors_soup = utils.getSoup(actors)
         
         infotables = actors_soup.findAll('table', {'class' : 'infotable'})
         cast = []
         
         for infotable in infotables:
             text = infotable.find('td').getText(separator='___')
             match = self._actor_re.match(text)
             if match is not None:
                 groups = match.groups()
                 cast.append('%s as %s' % (groups[0].strip(), groups[1].strip()))
                 # TODO: record actor images
         
         if len(cast) > 0:
             entity.cast = ', '.join(cast)
     except:
         pass
     
     # parse seasons
     try:
         seasons = "%s%s" % (self.base, contents[2].findAll('a')[-1].get('href'))
         seasons_soup = utils.getSoup(seasons)
         
         rows = seasons_soup.find('table', {'id' : 'listtable'}).findAll('tr')[1:]
         
         highest_season = -1
         earliest = None
         latest   = None
         
         # each row is an episode; loop through each episode, recording the 
         # earliest and latest air date for the show overall and the number 
         # of seasons the show ran for.
         for row in rows:
             tds = row.findAll('td')
             episode = tds[0].getText()
             match = self._season_re.match(episode)
             
             if match is not None:
                 groups  = match.groups()
                 season  = int(groups[0])
                 episode = int(groups[1])
                 
                 if season > highest_season:
                     highest_season = season
                 
                 date  = tds[2].getText()
                 match = self._date_re.match(date)
                 
                 if match is not None:
                     year, month, day = match.groups()
                     date = datetime(year=int(year), month=int(month), day=int(day))
                     
                     if earliest is None or date < earliest:
                         earliest = date
                     
                     if latest is None or date > latest:
                         latest = date
         
         if highest_season > 0:
             entity.num_seasons = highest_season
         
         if earliest is not None:
             entity.earliest_air_date = earliest
         
         if latest is not None:
             entity.latest_air_date = latest
     except:
         utils.printException()
     
     entity2 = self._thetvdb.lookup(entity.sources.thetvdb_id)
     
     if entity2 is not None:
         if entity2.mpaa_rating is not None:
             entity.mpaa_rating = entity2.mpaa_rating
         if entity2.imdb_id is not None:
             entity.imdb_id     = entity2.imdb_id
     
     self._output.put(entity)
Example #22
 def _parse_dump(self, filepath):
     f = gzip.open(filepath, 'rb')
     context = iter(etree.iterparse(f, events=("start", "end")))
     
     event, root = context.next()
     offset = 0
     count  = 0
     
     # loop through XML and parse each product element as a book Entity
     for event, elem in context:
         if event == "end" and elem.tag == "product" and elem.get('product_id') is not None:
             root.clear()
             
             if offset < Globals.options.offset:
                 offset += 1
                 continue
             
             if Globals.options.limit and count >= Globals.options.limit:
                 break
             
             try:
                 #assert 'books' == elem.find('.//primary').text.lower()
                 #assert 'USD' == elem.find('price').get('currency')
                 #assert float(elem.find('price').find('retail').text) >= 0.0
                 
                 entity = Entity()
                 entity.subcategory  = "book"
                 
                 entity.title        = elem.get('name')
                 entity.bid          = int(elem.get('product_id'))
                 entity.sku_number   = elem.get('sku_number')
                 entity.image        = elem.find('.//productImage').text
                 
                 entity.author       = elem.find('.//Author').text
                 entity.publisher    = elem.find('.//Publisher').text
                 entity.publish_date = elem.find('.//Publish_Date').text
                 isbn = elem.find('.//ISBN').text
                 
                 if isbn is None or len(isbn) <= 0:
                     continue
                 
                 entity.isbn         = isbn
                 
                 desc = elem.find('description')
                 is_english = 'nglish' in etree.tostring(desc)
                 
                 if not is_english:
                     continue
                 
                 #print etree.tostring(elem, pretty_print=True)
                 #self._globals['books'] = elem
                 #pprint(entity.value)
                 
                 self._output.put(entity)
                 count += 1
                 
                 # give the downstream consumer threads an occasional chance to work
                 if 0 == (count % 512):
                     time.sleep(0.1)
                 
                 parent = elem.getparent()
                 while True:
                     prev = elem.getprevious()
                     if prev is None:
                         break
                     parent.remove(prev)
                 
                 elem.clear()
             except Exception, e:
                 utils.printException()
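Example #20 and Example #22 share the same streaming idiom: iterate with lxml's iterparse, then clear each handled element and delete its already-parsed siblings, so a multi-gigabyte dump is processed in roughly constant memory. A stripped-down sketch of just that idiom (the tag name and file path are placeholders):

 import gzip
 from lxml import etree
 
 def stream_elements(filepath, tag='product'):
     # yield each completed element, reclaiming memory as we go
     f = gzip.open(filepath, 'rb')
     context = iter(etree.iterparse(f, events=("start", "end")))
     event, root = context.next()     # the first event yields the document root
     
     for event, elem in context:
         if event == "end" and elem.tag == tag:
             yield elem
             
             elem.clear()             # free this element's own subtree
             while elem.getprevious() is not None:
                 del elem.getparent()[0]   # drop fully-parsed earlier siblings
     
     f.close()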
Example #23
 def _parseResultsPage(self, pool, url, offset=0, base=False):
     utils.log('[%s] parsing page %s' % (self, url))
     max_offset = 8
     
     if offset < max_offset:
         # optimistically process the next results page before processing this one
         if 'start=' in url:
             start = self.start_re.match(url).groups()[0]
             nexti = int(start) + self.results_per_page
             url2  = url.replace('start=%s' % start, 'start=%d' % nexti)
         else:
             url2  = "%s&start=%d" % (url, self.results_per_page)
         
         pool.spawn(self._parseResultsPage, pool, url2, offset + 1)
     
     try:
         soup = utils.getSoup(url)
     except:
         utils.printException()
         utils.log("[%s] error downloading page %s" % (self, url))
         return
     
     if offset >= max_offset:
         next_pagel = soup.find('a', {'id' : 'pager_page_next'})
         
         if next_pagel is not None:
             href = self.base + next_pagel.get('href')
             pool.spawn(self._parseResultsPage, pool, href, 0)
             time.sleep(0.01)
     
     if base:
         categories = soup.findAll('a', {'id' : self.category_re})
         
         if categories is not None:
             for category in categories:
                 href = self.base + category.get('href')
                 pool.spawn(self._parseResultsPage, pool, href, 0)
             
             # yield so other threads have a chance to start working
             time.sleep(0.01)
     
     separator = '___'
     results   = soup.findAll('div', {'class' : re.compile('businessresult')})
     
      if not results:
          return
     
     for result in results:
         entity = Entity()
         entity.subcategory = 'restaurant'
         entity.sources.yelp = { }
         
         titlel = result.find('a')
         title  = titlel.getText()
         entity.title = self.title_re.match(title).groups()[0]
         entity.yurl  = self.base + titlel.get('href')
         
         addr   = result.find('address').getText(separator)
         match  = self.address_re.match(addr).groups()
         
         entity.address = "%s, %s" % (match[0], match[1])
         entity.phone = match[2]
         
         rating = result.find('img')
         if rating is not None:
             entity.yrating = float(self.rating_reviews_re.match(rating.get('title')).groups()[0])
         
         reviews = result.find('span', {'class' : 'reviews'})
         if reviews is not None:
             entity.yreviews = int(self.rating_reviews_re.match(reviews.getText()).groups()[0])
         
         key = (entity.title, entity.address)
         if key not in self.seen:
             self.seen.add(key)
             self._output.put(entity)
Example #24
    def _parseResultsPage(self, pool, href):
        try:
            soup = utils.getSoup(href)
        except urllib2.HTTPError:
            utils.log("[%s] error parsing page %s" % (self, href))
            return
        
        results = soup.find('div', { 'class' : 'searchresults' }).findAll('div', { 'class' : 'fs1-sans' })
        
        for result in results:
            
            # skip metadata rows (price, kid-friendliness, ratings, etc.) mixed into the results
            text = result.getText()
            if any(s in text for s in ('Price', 'Kid', 'Other', 'Wheelchair', 'Cuisines', 'Rating', 'Latest')):
                continue
            
            try:
                name = result.find('strong').getText().strip()
            except Exception:
                continue

            try:
                span1 = result.find('span')
                span2 = span1.findNext('span')
                span3 = span2.findNext('span')
                span4 = span3.findNext('span')
                addr  = '{0} {1}, {2}, {3}'.format(span1.getText(), span2.getText(),
                                                   span3.getText(), span4.getText())
            except Exception:
                utils.log("[%s] error parsing address (%s)" % (self, href))
                continue
                        
            if addr == '':
                continue
                
            if name == '':
                continue 
            
            if (name, addr) in self._seen:
                continue
            
            self._seen.add((name, addr))
            
            entity = Entity()
            entity.subcategory = "restaurant"
            entity.title   = name
            entity.address = addr
            entity.sources.washmag = { }
            
            self._output.put(entity)
        
        return 
Example #25
 def _parseEntity(self, result):
     entity = Entity()
     entity.subcategory = 'other'
     
     if 'titleNoFormatting' in result:
         entity.title = result['titleNoFormatting']
     
     if 'addressLines' in result:
         entity.address = ', '.join(result['addressLines'])
         entity.subtitle = entity.address
     
     if 'lat' in result and 'lng' in result:
         entity.lat = float(result['lat'])
         entity.lng = float(result['lng'])
     
     if 'region' in result:
         entity.vicinity = result['region']
     
     if 'phoneNumbers' in result:
         phoneNumbers = result['phoneNumbers']
         
         if len(phoneNumbers) > 0:
             entity.phone = phoneNumbers[0]['number']
     
     entity.googleLocal = {}
     entity.titlel = entity.title.lower()
     
     return entity