def download_episodes(podcast, verbose=True):
    """Download/refresh all episodes for *podcast*, maintaining its error state.

    On success, any previously recorded error on the Podcast row is cleared.
    Known fetch/parse failures (BadPodcastEntry, NotFound) are recorded on the
    row and swallowed; any unexpected exception is logged as a PodcastError
    and re-raised.
    """
    try:
        _download_episodes(podcast, verbose=verbose)
        if podcast.error:
            # This fetch worked, so the stored error is stale — clear it.
            Podcast.objects.filter(id=podcast.id).update(error=None)
    except (BadPodcastEntry, NotFound) as exception:
        # BUG FIX: the original called `unicode(exception)` — a Python 2
        # builtin that raises NameError on Python 3 (the rest of this module
        # is Python 3, e.g. print functions). `str` is the correct spelling.
        Podcast.objects.filter(id=podcast.id).update(error=str(exception))
    except Exception:
        PodcastError.create(podcast)
        raise
def download_episodes_task(podcast_id, verbose=True):
    """Task entry point: resolve the podcast by id and download its episodes.

    A missing podcast is logged and silently ignored. A NotFound raised by
    the download is recorded as a PodcastError and stored on the podcast's
    ``error`` field.
    """
    try:
        podcast = Podcast.objects.get(id=podcast_id)
    except Podcast.DoesNotExist:
        print("Warning! Podcast with id {} does not exist".format(podcast_id))
        return
    try:
        download_episodes(podcast, verbose=verbose)
    except NotFound as exception:
        PodcastError.create(podcast)
        # Some NotFound payloads arrive as bytes; normalize to text first.
        message = (
            exception.decode("utf-8")
            if isinstance(exception, bytes)
            else str(exception)
        )
        podcast.error = message
        podcast.save()
def redownload_podcast_image(podcast_id):
    """Re-download one podcast's image and validate it by thumbnailing it."""
    podcast = Podcast.objects.get(id=podcast_id)
    try:
        podcast.download_image()
        assert podcast.image
        # A download can look fine (right content-type, non-zero size) yet
        # still not be an image PIL can open; PIL only reveals that — via
        # IOError — when asked to build a thumbnail, so do that as a check.
        try:
            thumbnail(podcast.image, "300x300")
            print("Worked!")
        except IOError:
            print("Not a valid image if thumbnails can't be made")
            podcast.image = None
            podcast.save()
    except Exception:
        print("Failed!")
        PodcastError.create(podcast)
        raise
def _scrape_index(url, verbose=False, max_=1000):
    """Scrape a show-index page and create/refresh Podcast rows.

    Follows every ``.thumbnails a`` link on the page, resolves each show's
    RSS feed URL and image URL, and stores at most *max_* podcasts whose
    names are not already in the database. Network failures while fetching
    the index page abort silently.
    """
    try:
        html = download(url, gently=True)
    except requests_operational_errors:
        return
    doc = pyquery.PyQuery(html)
    shows = []
    for anchor in doc(".thumbnails a"):
        show_url = urljoin(url, anchor.attrib["href"])
        for heading in pyquery.PyQuery(anchor).find("h4"):
            shows.append((heading.text_content(), show_url))
    existing_names = Podcast.objects.all().values_list("name", flat=True)
    # XXX might not keep this
    shows = [pair for pair in shows if pair[0] not in existing_names]
    random.shuffle(shows)
    for name, show_url in shows[:max_]:
        rss_url = _scrape_show(show_url)
        if not rss_url:
            print("Skipping", name, show_url)
            continue
        image_url = get_image_url(rss_url)
        if not image_url:
            print("Skipping (no image)", name, rss_url)
            continue
        assert "://" in image_url, image_url
        podcast, created = Podcast.objects.get_or_create(name=name, url=rss_url)
        podcast.image_url = image_url
        podcast.save()
        try:
            podcast.download_image()
        except (AssertionError, NotAnImageError):
            # A bad image is non-fatal: record it and keep scraping.
            if verbose:
                print("Got an error trying to download the image :(")
                print("IGNORING AND MOVING ON")
            PodcastError.create(podcast)
        if verbose:
            print("CREATED" if created else "NOT NEW")
            print(repr(name))
def _download_episodes(podcast, verbose=True, timeout=10):
    """Fetch *podcast*'s feed and create/update an Episode row per entry.

    Parses the RSS/Atom feed at ``podcast.url``, computes each entry's
    duration (from ``itunes:duration``, or by probing the MP3 enclosure with
    ffmpeg), upserts Episode rows keyed by guid, and finally stamps
    ``last_fetch`` and ``latest_episode`` on the podcast.

    Raises BadPodcastEntry when an entry lacks ``published_parsed``, and
    BadEpisodeDurationError when ffmpeg cannot measure an enclosure.
    """
    assert podcast.name, podcast.id
    xml = download(podcast.url, timeout=timeout)
    d = feedparser.parse(xml)

    def get_duration(entry):
        # Return the duration in whole seconds, or None to skip this entry.
        if not entry.get("itunes_duration"):
            # No itunes:duration tag — probe the audio enclosure with ffmpeg.
            try:
                for link in entry["links"]:
                    if link["type"] == "audio/mpeg" or link["href"].lower().endswith(
                        ".mp3"
                    ):
                        duration, error = parse_duration_ffmpeg(link["href"])
                        if error:
                            raise BadEpisodeDurationError(error)
                        return duration
            except KeyError:
                try:
                    print(entry.enclosure)
                    raise Exception(entry.enclosure)
                except AttributeError:
                    # no 'itunes:duration' and no links
                    print("SKIPPING", entry)
                    return
        elif entry["itunes_duration"].count(":") >= 1:
            # Colon-separated "HH:MM:SS" (or "MM:SS") style value.
            try:
                itunes_duration = entry["itunes_duration"]
                # a bug in bad podcasts
                itunes_duration = itunes_duration.replace(">", "")
                itunes_duration = itunes_duration.replace(";", "")
                itunes_duration = [
                    int(float(x)) for x in itunes_duration.split(":") if x.strip()
                ]
            except ValueError:
                print("SKIPPING, BAD itunes_duration")
                print(entry)
                print("itunes_duration=", repr(entry["itunes_duration"]))
                return
            duration = 0
            itunes_duration.reverse()
            duration += itunes_duration[0]  # seconds
            if len(itunes_duration) > 1:
                duration += 60 * itunes_duration[1]  # minutes
            if len(itunes_duration) > 2:
                duration += 60 * 60 * itunes_duration[2]  # hours
            if duration > 24 * 60 * 60:
                # Nonsensical (>24h) tag — drop it and fall back to probing.
                entry["itunes_duration"] = None
                return get_duration(entry)
            return duration
        else:
            if not entry["itunes_duration"]:
                print("BUT!", xml.find("<itunes:duration"))
                return
            try:
                return int(float(entry["itunes_duration"]))
            except ValueError:
                print("SKIPPING itunes_duration not a number")
                print(repr(entry["itunes_duration"]))
                return

    for entry in d["entries"]:
        if not entry.get("published_parsed"):
            raise BadPodcastEntry(
                "Entry without a valid 'published_parsed'! ({})".format(podcast.url)
            )
        published = datetime.datetime.fromtimestamp(
            time.mktime(entry["published_parsed"])
        )
        if published.tzinfo is None:
            published = published.replace(tzinfo=timezone.utc)
        duration = get_duration(entry)
        if duration is None:
            continue
        # Prefer guid, then id, then a hash of the summary or title.
        try:
            guid = entry.guid
        except AttributeError:
            try:
                guid = entry.id
            except AttributeError:
                print("No guid or id. Going to use the summary.")
                try:
                    guid = hashlib.md5(entry.summary.encode("utf-8")).hexdigest()
                except AttributeError:
                    print("No guid or id or summary. ", "Going to use the title.")
                    guid = hashlib.md5(entry.title.encode("utf-8")).hexdigest()
        # BUG FIX: the original issued an extra Episode.objects.get() here
        # whose result was discarded (a leftover debugging query guarded by
        # `except Episode.DoesNotExist: pass`); the real lookup happens just
        # below, so the redundant DB round-trip per entry is removed.
        metadata = dict(entry)
        title = strip_tags(metadata.get("title"))
        summary = strip_tags(metadata.get("summary"))
        try:
            episode = Episode.objects.get(podcast=podcast, guid=guid)
            episode.duration = duration
            episode.published = published
            episode.metadata = metadata
            episode.title = title
            episode.summary = summary
            try:
                episode.save()
            except DataError:
                print("FROM", podcast.url)
                print("ENTRY")
                print(entry)
                print("TRIED TO SAVE DURATION", duration)
                PodcastError.create(podcast, notes="Tried to save duration")
                raise
        except Episode.DoesNotExist:
            episode = Episode.objects.create(
                podcast=podcast,
                duration=duration,
                published=published,
                guid=guid,
                metadata=metadata,
                title=title,
                summary=summary,
            )
            print("CREATED episode")
            print((episode.podcast.name, episode.guid, episode.duration,
                   episode.published))
    print("SETTING last_fetch ON {!r}".format(podcast))
    latest_episode = Episode.objects.filter(podcast=podcast).aggregate(
        latest=Max("published")
    )["latest"]
    print("SETTING latest_episode {!r}".format(latest_episode))
    # Re-read the row before writing so concurrent updates aren't clobbered.
    podcast.refresh_from_db()
    podcast.last_fetch = timezone.now()
    podcast.latest_episode = latest_episode
    podcast.save()
def _download_episodes(podcast, verbose=True):
    # NOTE(review): legacy Python 2 variant (print statements) — a Python 3
    # version of this function with the same name exists in this file.
    """Fetch the podcast's feed and create/update an Episode row per entry."""
    xml = download(podcast.url)
    d = feedparser.parse(xml)

    def get_duration(entry):
        # Returns duration in seconds, or None to skip this entry.
        if not entry.get('itunes_duration'):
            # No itunes:duration tag — probe the MP3 enclosure with ffmpeg.
            try:
                for link in entry['links']:
                    if (
                        link['type'] == 'audio/mpeg' or
                        link['href'].lower().endswith('.mp3')
                    ):
                        return parse_duration_ffmpeg(
                            link['href']
                        )
            except KeyError:
                try:
                    print entry.enclosure
                    raise Exception(entry.enclosure)
                except AttributeError:
                    # no 'itunes:duration' and no links
                    print "SKIPPING", entry
                    return
        elif entry['itunes_duration'].count(':') >= 1:
            # Colon-separated "HH:MM:SS" (or "MM:SS") style value.
            try:
                itunes_duration = entry['itunes_duration']
                # a bug in bad podcasts
                itunes_duration = itunes_duration.replace('>', '')
                itunes_duration = itunes_duration.replace(';', '')
                itunes_duration = [
                    int(float(x)) for x in itunes_duration.split(':')
                    if x.strip()
                ]
            except ValueError:
                print "SKIPPING, BAD itunes_duration"
                print entry
                print 'itunes_duration=', repr(entry['itunes_duration'])
                return
            duration = 0
            itunes_duration.reverse()
            duration += itunes_duration[0]  # seconds
            if len(itunes_duration) > 1:
                duration += 60 * itunes_duration[1]  # minutes
            if len(itunes_duration) > 2:
                duration += 60 * 60 * itunes_duration[2]  # hours
            if duration > 24 * 60 * 60:
                # Nonsensical (>24h) tag — drop it and fall back to probing.
                entry['itunes_duration'] = None
                return get_duration(entry)
            return duration
        else:
            if not entry['itunes_duration']:
                print "BUT!", xml.find('<itunes:duration')
                return
            try:
                return int(float(entry['itunes_duration']))
            except ValueError:
                # pprint(entry)
                print "SKIPPING itunes_duration not a number"
                print repr(entry['itunes_duration'])
                return

    for entry in d['entries']:
        if not entry.get('published_parsed'):
            # print "Entry without a valid 'published_parsed'!"
            # print entry
            raise BadPodcastEntry(
                "Entry without a valid 'published_parsed'! ({})".format(
                    podcast.url
                )
            )
        published = datetime.datetime.fromtimestamp(
            time.mktime(entry['published_parsed'])
        )
        if published.tzinfo is None:
            published = published.replace(tzinfo=timezone.utc)
        duration = get_duration(entry)
        if duration is None:
            continue
        # Prefer guid, then id, then a hash of the summary or title.
        try:
            guid = entry.guid
        except AttributeError:
            try:
                guid = entry.id
            except AttributeError:
                print "No guid or id. Going to use the summary."
                try:
                    guid = hashlib.md5(
                        entry.summary.encode('utf-8')
                    ).hexdigest()
                except AttributeError:
                    print "No guid or id or summary. ",
                    print "Going to use the title."
                    guid = hashlib.md5(
                        entry.title.encode('utf-8')
                    ).hexdigest()
                    # raise
        # NOTE(review): this lookup's result is only used for debug prints;
        # the actual upsert happens in the try/except just below.
        try:
            ep = Episode.objects.get(
                podcast=podcast,
                guid=guid
            )
            if ep.duration != duration:
                print "DURATION CHANGED!!!"
            else:
                print "Duration unchanged"
            if ep.published != published:
                print "PUBLISHED CHANGED!!!"
            else:
                print "Published unchanged"
        except Episode.DoesNotExist:
            pass
        try:
            episode = Episode.objects.get(
                podcast=podcast,
                guid=guid
            )
            episode.duration = duration
            episode.published = published
            try:
                episode.save()
                print "SAVED",
            except DataError:
                print "FROM", podcast.url
                print "ENTRY"
                print entry
                print "TRIED TO SAVE DURATION", duration
                PodcastError.create(podcast, notes='Tried to save duration')
                raise
        except Episode.DoesNotExist:
            episode = Episode.objects.create(
                podcast=podcast,
                duration=duration,
                published=published,
                guid=guid,
            )
            print "CREATED",
            print (
                episode.podcast.name,
                episode.guid,
                episode.duration,
                episode.published
            )
    print("SETTING last_fetch ON {!r}".format(podcast))
    Podcast.objects.filter(id=podcast.id).update(last_fetch=timezone.now())
def _scrape_index(url, verbose=False, max_=1000):
    # NOTE(review): legacy Python 2 variant (print statements) — a Python 3
    # version of this function with the same name exists in this file.
    """Scrape a show-index page and create/update Podcast rows from it."""
    html = download(url, gently=True)
    doc = pyquery.PyQuery(html)
    links = doc('.thumbnails a')
    shows = []
    for link in links:
        show_url = link.attrib['href']
        show_url = urljoin(url, show_url)
        link = pyquery.PyQuery(link)
        for h4 in link.find('h4'):
            name = h4.text_content()
            shows.append((name, show_url))
    existing_names = Podcast.objects.all().values_list('name', flat=True)
    # XXX might not keep this
    shows = [
        (n, u) for (n, u) in shows
        if n not in existing_names
    ]
    random.shuffle(shows)
    for name, show_url in shows[:max_]:
        rss_url = _scrape_show(show_url)
        if not rss_url:
            print "Skipping", name, show_url
            continue
        image_url = get_image_url(rss_url)
        if not image_url:
            print "Skipping (no image)", name, rss_url
            continue
        assert '://' in image_url, image_url
        # print "IMAGE_URL", image_url
        # Upsert by name: refresh URLs on an existing row, else create.
        try:
            podcast = Podcast.objects.get(name=name)
            podcast.url = rss_url
            podcast.image_url = image_url
            podcast.save()
            created = False
        except Podcast.DoesNotExist:
            podcast = Podcast.objects.create(
                name=name,
                url=rss_url,
                image_url=image_url,
            )
            created = True
        try:
            podcast.download_image()
        except (AssertionError, NotAnImageError):
            # A bad image is non-fatal: record the error and keep scraping.
            if verbose:
                print "Got an error trying to download the image :("
                print "IGNORING AND MOVING ON"
            PodcastError.create(podcast)
        if verbose:
            if created:
                print "CREATED",
            else:
                print "NOT NEW",
            print repr(name)
def _download_episodes(podcast, verbose=True, timeout=10):
    """Fetch *podcast*'s feed and create/update an Episode row per entry.

    Parses the RSS/Atom feed at ``podcast.url``, computes each entry's
    duration (from ``itunes:duration``, or by probing the MP3 enclosure with
    ffmpeg), upserts Episode rows keyed by guid, and finally stamps
    ``last_fetch`` and ``latest_episode`` on the podcast.

    Raises BadPodcastEntry when an entry lacks ``published_parsed``, and
    BadEpisodeDurationError when ffmpeg cannot measure an enclosure.
    """
    assert podcast.name, podcast.id
    xml = download(podcast.url, timeout=timeout)
    d = feedparser.parse(xml)

    def get_duration(entry):
        # Return the duration in whole seconds, or None to skip this entry.
        if not entry.get("itunes_duration"):
            # No itunes:duration tag — probe the audio enclosure with ffmpeg.
            try:
                for link in entry["links"]:
                    if link["type"] == "audio/mpeg" or link["href"].lower().endswith(
                        ".mp3"
                    ):
                        duration, error = parse_duration_ffmpeg(link["href"])
                        if error:
                            raise BadEpisodeDurationError(error)
                        return duration
            except KeyError:
                try:
                    print(entry.enclosure)
                    raise Exception(entry.enclosure)
                except AttributeError:
                    # no 'itunes:duration' and no links
                    print("SKIPPING", entry)
                    return
        elif entry["itunes_duration"].count(":") >= 1:
            # Colon-separated "HH:MM:SS" (or "MM:SS") style value.
            try:
                itunes_duration = entry["itunes_duration"]
                # a bug in bad podcasts
                itunes_duration = itunes_duration.replace(">", "")
                itunes_duration = itunes_duration.replace(";", "")
                itunes_duration = [
                    int(float(x)) for x in itunes_duration.split(":") if x.strip()
                ]
            except ValueError:
                print("SKIPPING, BAD itunes_duration")
                print(entry)
                print("itunes_duration=", repr(entry["itunes_duration"]))
                return
            duration = 0
            itunes_duration.reverse()
            duration += itunes_duration[0]  # seconds
            if len(itunes_duration) > 1:
                duration += 60 * itunes_duration[1]  # minutes
            if len(itunes_duration) > 2:
                duration += 60 * 60 * itunes_duration[2]  # hours
            if duration > 24 * 60 * 60:
                # Nonsensical (>24h) tag — drop it and fall back to probing.
                entry["itunes_duration"] = None
                return get_duration(entry)
            return duration
        else:
            if not entry["itunes_duration"]:
                print("BUT!", xml.find("<itunes:duration"))
                return
            try:
                return int(float(entry["itunes_duration"]))
            except ValueError:
                print("SKIPPING itunes_duration not a number")
                print(repr(entry["itunes_duration"]))
                return

    for entry in d["entries"]:
        if not entry.get("published_parsed"):
            raise BadPodcastEntry(
                "Entry without a valid 'published_parsed'! ({})".format(podcast.url)
            )
        published = datetime.datetime.fromtimestamp(
            time.mktime(entry["published_parsed"])
        )
        if published.tzinfo is None:
            published = published.replace(tzinfo=timezone.utc)
        duration = get_duration(entry)
        if duration is None:
            continue
        # Prefer guid, then id, then a hash of the summary or title.
        try:
            guid = entry.guid
        except AttributeError:
            try:
                guid = entry.id
            except AttributeError:
                print("No guid or id. Going to use the summary.")
                try:
                    guid = hashlib.md5(entry.summary.encode("utf-8")).hexdigest()
                except AttributeError:
                    print("No guid or id or summary. ", "Going to use the title.")
                    guid = hashlib.md5(entry.title.encode("utf-8")).hexdigest()
        # BUG FIX: the original issued an extra Episode.objects.get() here
        # whose result was discarded (a leftover debugging query guarded by
        # `except Episode.DoesNotExist: pass`); the real lookup happens just
        # below, so the redundant DB round-trip per entry is removed.
        metadata = dict(entry)
        title = strip_tags(metadata.get("title"))
        summary = strip_tags(metadata.get("summary"))
        try:
            episode = Episode.objects.get(podcast=podcast, guid=guid)
            episode.duration = duration
            episode.published = published
            episode.metadata = metadata
            episode.title = title
            episode.summary = summary
            try:
                episode.save()
            except DataError:
                print("FROM", podcast.url)
                print("ENTRY")
                print(entry)
                print("TRIED TO SAVE DURATION", duration)
                PodcastError.create(podcast, notes="Tried to save duration")
                raise
        except Episode.DoesNotExist:
            episode = Episode.objects.create(
                podcast=podcast,
                duration=duration,
                published=published,
                guid=guid,
                metadata=metadata,
                title=title,
                summary=summary,
            )
            print("CREATED episode")
            print((episode.podcast.name, episode.guid, episode.duration,
                   episode.published))
    print("SETTING last_fetch ON {!r}".format(podcast))
    latest_episode = Episode.objects.filter(podcast=podcast).aggregate(
        latest=Max("published")
    )["latest"]
    print("SETTING latest_episode {!r}".format(latest_episode))
    # Re-read the row before writing so concurrent updates aren't clobbered.
    podcast.refresh_from_db()
    podcast.last_fetch = timezone.now()
    podcast.latest_episode = latest_episode
    podcast.save()