def _parse_series_page(self, name, url): if '**' in name or 'DUPLICATE' in name or name.startswith('.hack'): return utils.log('[%s] parsing page %s (%s)' % (self, name, url)) try: soup = utils.getSoup(url) except: utils.printException() utils.log("[%s] error downloading page %s (%s)" % (self, name, url)) return contents = soup.findAll('div', {'id' : 'content'}) header = contents[0] h1 = header.find('h1') title = h1.getText() h1.extract() entity = Entity() # parse basic show info entity.title = title entity.subcategory = 'tv' desc = header.getText().replace('\r\n', '\n') if len(desc) > 5: entity.desc = desc entity.sources.thetvdb_id = self._id_re.match(url).groups()[0] # parse images images = map(lambda img: img.get('src'), soup.findAll('img', {'class' : 'banner'})) types = [ 'posters', 'fanart', 'graphical', ] for image_type in types: filtered_images = filter(lambda img: image_type in img, images) if len(filtered_images) > 0: entity.image = "%s%s" % (self.base, filtered_images[0]) break info = contents[1].find('table').find('table') rows = info.findAll('tr') # parse detailed show info info_map = { 0 : 'original_release_date', 3 : 'air_time', 4 : 'network_name', 5 : 'genre', } for k, k2 in info_map.iteritems(): try: value = rows[k].findAll('td')[1].getText() if len(value) > 0: entity[k2] = value except: utils.printException() pass # parse cast try: actors = "%s%s" % (self.base, contents[-1].findAll('a')[-1].get('href')) actors_soup = utils.getSoup(actors) infotables = actors_soup.findAll('table', {'class' : 'infotable'}) cast = [] for infotable in infotables: text = infotable.find('td').getText(separator='___') match = self._actor_re.match(text) if match is not None: groups = match.groups() cast.append('%s as %s' % (groups[0].strip(), groups[1].strip())) # TODO: record actor images if len(cast) > 0: entity.cast = ', '.join(cast) except: pass # parse seasons try: seasons = "%s%s" % (self.base, contents[2].findAll('a')[-1].get('href')) seasons_soup = utils.getSoup(seasons) rows = seasons_soup.find('table', {'id' : 'listtable'}).findAll('tr')[1:] highest_season = -1 earliest = None latest = None # each row is an episode; loop through each episode, recording the # earliest and latest air date for the show overall and the number # of seasons the show ran for. for row in rows: tds = row.findAll('td') episode = tds[0].getText() match = self._season_re.match(episode) if match is not None: groups = match.groups() season = int(groups[0]) episode = int(groups[1]) if season > highest_season: highest_season = season date = tds[2].getText() match = self._date_re.match(date) if match is not None: year, month, day = match.groups() date = datetime(year=int(year), month=int(month), day=int(day)) if earliest is None or date < earliest: earliest = date if latest is None or date > latest: latest = date if highest_season > 0: entity.num_seasons = highest_season if earliest is not None: entity.earliest_air_date = earliest if latest is not None: entity.latest_air_date = latest except: utils.printException() entity2 = self._thetvdb.lookup(entity.sources.thetvdb_id) if entity2 is not None: if entity2.mpaa_rating is not None: entity.mpaa_rating = entity2.mpaa_rating if entity2.imdb_id is not None: entity.imdb_id = entity2.imdb_id self._output.put(entity)