Example #1
0
 def _parse_series_page(self, name, url):
     """Scrape one TheTVDB series page into an Entity and enqueue it.
     
     Skips placeholder/duplicate listings, then pulls the show's title,
     description, banner image, air info, cast, and season/air-date range
     from the page HTML, merges mpaa_rating/imdb_id from the TheTVDB API
     lookup, and puts the finished Entity onto self._output.
     
     Params:
         name -- the series' display name (used only for filtering/logging)
         url  -- absolute URL of the series page to download and parse
     """
     # ignore placeholder ('**'), duplicate, and '.hack'-prefixed listings
     if '**' in name or 'DUPLICATE' in name or name.startswith('.hack'):
         return
     
     utils.log('[%s] parsing page %s (%s)' % (self, name, url))
     
     try:
         soup = utils.getSoup(url)
     except Exception:
         # narrowed from a bare except: so KeyboardInterrupt/SystemExit propagate
         utils.printException()
         utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
         return
     
     contents = soup.findAll('div', {'id' : 'content'})
     header = contents[0]
     
     # the <h1> holds the title; extract it so header.getText() below
     # yields only the description text
     h1 = header.find('h1')
     title = h1.getText()
     h1.extract()
     
     entity = Entity()
     
     # parse basic show info
     entity.title = title
     entity.subcategory = 'tv'
     
     desc = header.getText().replace('\r\n', '\n')
     if len(desc) > 5:
         # skip trivially short (effectively empty) descriptions
         entity.desc = desc
     
     entity.sources.thetvdb_id = self._id_re.match(url).groups()[0]
     
     # parse images -- prefer posters, then fanart, then graphical banners
     images = map(lambda img: img.get('src'), soup.findAll('img', {'class' : 'banner'}))
     types  = [ 'posters', 'fanart', 'graphical', ]
     
     for image_type in types:
         filtered_images = filter(lambda img: image_type in img, images)
         if len(filtered_images) > 0:
             entity.image = "%s%s" % (self.base, filtered_images[0])
             break
     
     info = contents[1].find('table').find('table')
     rows = info.findAll('tr')
     
     # parse detailed show info; maps info-table row index -> entity attribute
     info_map = {
         0 : 'original_release_date', 
         3 : 'air_time', 
         4 : 'network_name', 
         5 : 'genre', 
     }
     
     for k, k2 in info_map.iteritems():
         try:
             value = rows[k].findAll('td')[1].getText()
             if len(value) > 0:
                 entity[k2] = value
         except Exception:
             # row layout varies between shows; log and keep the other fields
             utils.printException()
     
     # parse cast (best-effort: the cast page may be missing or malformed)
     try:
         actors = "%s%s" % (self.base, contents[-1].findAll('a')[-1].get('href'))
         actors_soup = utils.getSoup(actors)
         
         infotables = actors_soup.findAll('table', {'class' : 'infotable'})
         cast = []
         
         for infotable in infotables:
             text = infotable.find('td').getText(separator='___')
             match = self._actor_re.match(text)
             if match is not None:
                 groups = match.groups()
                 cast.append('%s as %s' % (groups[0].strip(), groups[1].strip()))
                 # TODO: record actor images
         
         if len(cast) > 0:
             entity.cast = ', '.join(cast)
     except Exception:
         # deliberately silent: cast is optional
         pass
     
     # parse seasons
     try:
         seasons = "%s%s" % (self.base, contents[2].findAll('a')[-1].get('href'))
         seasons_soup = utils.getSoup(seasons)
         
         # skip the header row of the episode listing
         rows = seasons_soup.find('table', {'id' : 'listtable'}).findAll('tr')[1:]
         
         highest_season = -1
         earliest = None
         latest   = None
         
         # each row is an episode; loop through each episode, recording the 
         # earliest and latest air date for the show overall and the number 
         # of seasons the show ran for.
         for row in rows:
             tds = row.findAll('td')
             episode = tds[0].getText()
             match = self._season_re.match(episode)
             
             if match is not None:
                 groups  = match.groups()
                 season  = int(groups[0])
                 episode = int(groups[1])
                 
                 if season > highest_season:
                     highest_season = season
                 
                 date  = tds[2].getText()
                 match = self._date_re.match(date)
                 
                 if match is not None:
                     year, month, day = match.groups()
                     date = datetime(year=int(year), month=int(month), day=int(day))
                     
                     if earliest is None or date < earliest:
                         earliest = date
                     
                     if latest is None or date > latest:
                         latest = date
         
         if highest_season > 0:
             entity.num_seasons = highest_season
         
         if earliest is not None:
             entity.earliest_air_date = earliest
         
         if latest is not None:
             entity.latest_air_date = latest
     except Exception:
         utils.printException()
     
     # merge supplemental data (rating, IMDb id) from the TheTVDB API lookup
     entity2 = self._thetvdb.lookup(entity.sources.thetvdb_id)
     
     if entity2 is not None:
         if entity2.mpaa_rating is not None:
             entity.mpaa_rating = entity2.mpaa_rating
         if entity2.imdb_id is not None:
             entity.imdb_id     = entity2.imdb_id
     
     self._output.put(entity)
Example #2
0
 def _parse_dump(self, filename):
     """Stream-parse a gzipped XML catalog dump into movie/TV Entities.
     
     Iterates 'catalog_title' elements with lxml's iterparse so the whole
     dump is never held in memory, skips bonus-material and non-English
     titles, and puts each parsed Entity onto self._output.
     
     Params:
         filename -- path to the gzip-compressed XML dump file
     """
     f = gzip.open(filename, 'rb')
     context = iter(etree.iterparse(f, events=("start", "end")))
     
     # grab the root element up front so we can clear processed children
     # from it as we go, keeping memory usage bounded
     event, root = context.next()
     
     # extracts the trailing numeric id from a '.../<digits>' URL
     nid_re              = re.compile('.*\/([0-9]*)$')
     language_re         = re.compile('.*\/languages$')
     match_genre_re      = re.compile('.*\/genres$')
     match_ratings_re    = re.compile('.*\/mpaa_ratings$')
     
     # predicates classifying a <category> element by its 'scheme' URL suffix
     match_genre_func    = lambda c: re.match(match_genre_re, c.get('scheme')) is not None
     match_ratings_func  = lambda c: re.match(match_ratings_re, c.get('scheme')) is not None
     match_language_func = lambda c: re.match(language_re, c.get('scheme')) is not None
     
     count = 0
     # ids of titles referenced as bonus material; used to skip those
     # titles when they later appear as top-level catalog entries.
     # NOTE(review): assumes a bonus_materials reference appears before the
     # referenced title's own catalog_title element -- confirm dump ordering.
     bonus_materials = set()
     
     # loop through each XML catalog_title element and parse it as a movie Entity
     for event, elem in context:
         if event == "end" and elem.tag == "catalog_title":
             root.clear()
             
             try:
                 # titles without an average rating are skipped entirely
                 rating_elem = elem.find('average_rating')
                 if rating_elem is None:
                     continue
                 
                 entity = Entity()
                 nid = elem.find('id').text
                 nid = int(re.match(nid_re, nid).groups()[0])
                 
                 # record the ids of any bonus materials this title links to
                 bonus_materials_elem = elem.find('.//bonus_materials')
                 if bonus_materials_elem is not None:
                     links = map(lambda l: l.get('href'), bonus_materials_elem.findall('link'))
                     
                     for link in links:
                         bonus_material_id = int(re.match(nid_re, link).groups()[0])
                         #bonus_material_id = re.match(bonus_materials_id_re, link).groups()[0]
                         bonus_materials.add(bonus_material_id)
                 
                 # skip titles that are themselves bonus material
                 if nid in bonus_materials:
                     continue
                 
                 title = elem.find('title').get('regular')
                 titlel = title.lower()
                 
                 if 'bonus material' in titlel:
                     continue
                 
                 entity.title = title
                 entity.nid = nid
                 entity.desc = elem.find('.//synopsis').text
                 entity.nrating = float(rating_elem.text)
                 
                 categories = elem.findall('category')
                 
                 genres = map(lambda c: c.get('label'), filter(match_genre_func, categories))
                 entity.ngenres = genres
                 
                 # classify as TV if any genre label mentions 'tv', else movie
                 tv = False
                 for genre in genres:
                     if 'tv' in genre.lower():
                         tv = True
                         break
                 
                 if tv:
                     entity.subcategory = 'tv'
                 else:
                     entity.subcategory = 'movie'
                 
                 # only take the MPAA rating when it is unambiguous
                 ratings = map(lambda c: c.get('label'), filter(match_ratings_func, categories))
                 if 1 == len(ratings):
                     entity.mpaa_rating = ratings[0]
                 
                 # box art comes in 3 or 4 sizes: tiny, small, large[, hd]
                 images = elem.find('.//box_art').findall('link')
                 if 3 == len(images) or 4 == len(images):
                     entity.tiny  = images[0].get('href')
                     entity.small = images[1].get('href')
                     entity.large = images[2].get('href')
                     
                     if 4 == len(images):
                         entity.hd = images[3].get('href')
                 
                 links = filter(lambda l: 'web page' == l.get('title'), elem.findall('link'))
                 if 1 == len(links):
                     entity.nurl = links[0].get('href')
                 
                 language_elem  = elem.find('.//languages_and_audio')
                 language_elems = filter(match_language_func, language_elem.findall('.//category'))
                 
                 release_year_elem = elem.find('release_year')
                 if release_year_elem is not None:
                     entity.original_release_date = release_year_elem.text
                 
                 duration = elem.find('runtime')
                 if duration is not None:
                     entity.track_length = duration.text
                 
                 # only keep titles that offer English language/audio
                 languages = set()
                 for elem2 in language_elems:
                     languages.add(elem2.get('label').lower())
                 
                 if 'english' not in languages:
                     continue
                 
                 #utils.log(entity.title)
                 #pprint(entity.getDataAsDict())
                 
                 """
                 self._globals['n'] = elem
                 self._globals['s'] = etree.tostring(elem, pretty_print=True)
                 self._globals['e'] = entity
                 break
                 """
                 
                 self._output.put(entity)
                 count += 1
                 
                 # give the downstream consumer threads an occasional chance to work
                 if 0 == (count % 512):
                     time.sleep(0.1)
                 
                 # free this element's memory; NOTE(review): skipped when a
                 # 'continue' or an exception fires above, so malformed or
                 # filtered records are only reclaimed by the next root.clear()
                 elem.clear()
             except Exception, e:
                 utils.printException()
                 utils.log(elem.find('title').get('regular'))