def get_series_info(self):
    """
    Gets the Series info for a TVShow and puts it into the GAE Datastore.

    The information which is scraped includes show rating, show name, show
    description and show status. If fanart scraping is enabled and fanart
    exists, a hexagon image is also generated and stored in the datastore.
    """
    if not self.disbable_tvshow_scraping:
        if not self.rescrape:
            # Generate the fanart URL if fanart exists. Used to generate the
            # Hexagon image for the show
            fanart_url = TVDB_BANNER_URL + self.tvdbsoup.fanart.text if self.tvdbsoup.fanart.text else None
        else:
            # On a rescrape, reuse the fanart URL already stored for the show
            show = TVShow.get_by_key_name(self.tvdb_id)
            fanart_url = show.fanart

        # Identify the show genres
        genres = self.tvdbsoup.Genre.text.strip("|").split("|")

        # Find the number of seasons (find_all hoisted so the document is
        # only searched once instead of twice)
        season_tags = self.tvdbsoup.find_all("SeasonNumber")
        num_of_seasons = int(season_tags[-1].text) if season_tags else 0

        # If this is a new scrape and the number of seasons is greater than 10,
        # disable episode rating scraping to conserve app engine quota
        if not self.rescrape and num_of_seasons > 10:
            self.disable_episode_ratings = 1
            self.options_array[1] = 1

        # Put the scraped information into the GAE datastore
        tv_show = TVShow(
            key_name=self.tvdb_id,
            title=self.tvdbsoup.SeriesName.text,
            desc=self.tvdbsoup.Overview.text,
            rating=float(self.get_imdb_rating(self.tvdbsoup.IMDB_ID.text)),
            fanart=fanart_url,
            genre=genres[0] if genres else None,
            subgenre=genres[1] if len(genres) > 1 else None,
            status=self.tvdbsoup.Status.text,
            imdb_id=self.tvdbsoup.IMDB_ID.text,
            url_string=self.slug,
            # Epoch sentinel; the real scrape time is stamped by the caller
            last_scraped=datetime.utcfromtimestamp(0),
            num_seasons=num_of_seasons,
        ).put()

        # Obtain the key for the TVShow
        self.series_key = tv_show
    else:
        # Get the series key from the datastore
        self.series_key = TVShow.get_by_key_name(self.tvdb_id)
        fanart_url = self.series_key.fanart

    # If fanart exists generate a hexagon image and store in the datastore
    if not self.disbable_fanart_scraping and fanart_url:
        hexagon_image = Hexagon(fanart_url).get_hex()

        # Check if the hexagon is valid
        if hexagon_image:
            HexImages(parent=self.series_key, key_name=self.tvdb_id,
                      image=db.Blob(hexagon_image)).put()
def email_update(request):
    """
    A function which finds all the users who are subscribed to shows airing
    this week and for each user, generates an email with a list of these shows
    and when they're airing

    :param request: The request object for the page
    :return: A HttpResponse Object, which renders a page specifying how many
             emails were sent
    """
    # Get all the users
    q = db.GqlQuery("SELECT * FROM User")

    # Calculate 7 days from now to use in the query
    weektoday = date.today() + timedelta(days=7)

    messages_sent = 0
    for user in q.run():
        # For each user get the shows they're subscribed to
        show_query = db.GqlQuery("SELECT * FROM UserShow WHERE user_id = :id", id=user.key().name())
        show_ids = [str(show.show_id) for show in show_query.run()]

        # Don't send email if they're not subscribed to any shows
        if not show_ids:
            continue

        shows = TVShow.get_by_key_name(show_ids)

        # For all the shows subscribed to - find the shows which have episodes
        # airing this week
        episodes_this_week = {date.today() + timedelta(days=k): [] for k in range(7)}
        ep_this_week = False
        for showid in shows:
            episodes_query = db.GqlQuery(
                "SELECT * FROM TVEpisode WHERE airdate >= :today AND airdate < :weektoday AND ANCESTOR IS :showid ORDER BY airdate",
                today=date.today(), weektoday=weektoday, showid=showid)

            # Map the date for an episode to a dictionary containing the show
            # title, the episode name and the season and episode number.
            # Iterate the query directly - no need to materialize a list first.
            for episode in episodes_query.run():
                ep_this_week = True
                episodes_this_week[episode.airdate].append({
                    'show_title': showid.title,
                    'ep_name': episode.name,
                    'season_num': episode.season,
                    'ep_num': episode.ep_number,
                })

        # Don't send email if there are no episode airing this week
        if not ep_this_week:
            continue

        # Construct a message containing the episodes for this week
        message = "Hello Telehex Subscriber,\n\nHere are your shows airing this week:\n\n"
        for key in sorted(episodes_this_week):
            if not episodes_this_week[key]:
                continue
            message += "{0}:\n".format(key.strftime("%B %d, %Y"))
            for show_ep in episodes_this_week[key]:
                message += "\t{0} - {1} (S{2:02d}E{3:02d})\n".format(
                    show_ep['show_title'], show_ep['ep_name'],
                    show_ep['season_num'], show_ep['ep_num'])
            message += "\n\n"

        # Get the server to send the mail
        mail.send_mail(sender="*****@*****.**",
                       to="{0}".format(user.email),
                       subject="Telehex - Your weekly episode email",
                       body=message)
        messages_sent += 1

    # Task complete, return a response with the number of messages sent
    return render(request, 'telehex/email_update.html', {"messages_sent": messages_sent})
def __init__(self, tvdb_id, rescrape=False, options="00000000", update_options=True):
    """
    The :class:Scraper class constructor. Takes a tvdb_id and initialises the
    scraping for the TVShow and the TVEpisodes

    :param tvdb_id: The tvdb id was the show to be scraped.
    :param rescrape: False for a first scrape, True when re-scraping an
        existing show.
    :param options: The options param is used to specify options for the
        scraping and display of a show. Each character index of the string
        represents a specific option. These are
        0. Disable Scraping: 0 = scraping enabled, 1 = scraping disabled
        1. Disable Episode Ratings Scraping: 0 = episode scraping enabled,
           1 = episode scraping disabled
        2. Disable Fanart Scraping: 0 = fanart scraping enabled,
           1 = fanart scraping disabled
        3. Disable TVShow Scraping: 0 = TVShow scraping enabled,
           1 = TVShow scraping disabled
        4. Disable TVEpisode Scraping: 0 = TVEpisode scraping enabled,
           1 = TVEpisode scraping disabled
        5. Disable Episode Description Display: 0 = display episode desc,
           1 = don't display episode desc
        6. Disable Episode Display: 0 = display episodes,
           1 = don't display episodes
        7. Reserved
    :param update_options: When True, the (possibly mutated) options array is
        written back to the stored TVShow after scraping.
    """
    self.series_key = None
    self.rating = -1

    # Determine if this is a new scrape or a rescrape
    self.rescrape = rescrape

    # If this is a first scrape then set options to default
    if not self.rescrape:
        self.options = "00000000"
    else:
        self.options = options

    # Create the options array.
    # NOTE(review): built from the `options` argument, not `self.options` —
    # on a first scrape with a non-default `options` string the flags below
    # will not match the "00000000" default stored above; confirm intended.
    # NOTE: Python 2 `map` returns a list (it is indexed and mutated later);
    # wrap in list() if ever porting to Python 3.
    self.options_array = map(int, list(options))

    # Specify the options relevant to the scraping
    # (the "disbable" spelling is an existing typo kept for consistency
    # with the rest of the class)
    self.disable_scraping = self.options_array[0]
    self.disable_episode_ratings = self.options_array[1]
    self.disbable_fanart_scraping = self.options_array[2]
    self.disbable_tvshow_scraping = self.options_array[3]
    self.disbable_tvepisode_scraping = self.options_array[4]

    # If the scraping isn't disable do this
    if not self.disable_scraping:
        # Increase the timeout for fetching a url - required for large shows
        urlfetch.set_default_fetch_deadline(60)
        self.tvdb_id = tvdb_id

        # Fetch the XML from tvdb and turn into a BeautifulSoup Object
        self.tvdbxml = urllib2.urlopen(
            "http://thetvdb.com/api/{0}/series/{1}/all/en.xml".format(API_KEY, self.tvdb_id)
        )
        self.tvdbsoup = BeautifulSoup(self.tvdbxml.read(), "xml")

        # Generate the show slug for the show, e.g. Breaking Bad becomes
        # breaking_bad (strip punctuation, then collapse non-word runs to "_")
        exclude_chars = set(string.punctuation)
        self.slug = "".join(char for char in self.tvdbsoup.SeriesName.text if char not in exclude_chars)
        self.slug = re.sub(r"\W+", "_", self.slug.lower())

        # Perform the scraping for the TVShow
        self.get_series_info()

        if not self.disbable_tvepisode_scraping:
            # Perform the scraping for the TVEpisodes
            self.get_episode_info()

        # Specify when the show was last scraped
        q = TVShow.get_by_key_name(self.tvdb_id)
        q.last_scraped = datetime.now()
        if update_options:
            # Persist the options array (get_series_info may have flipped
            # the episode-ratings flag for large shows)
            q.options = "".join(str(x) for x in self.options_array)
        q.put()