def _scrape_feed(url, verbose=False):
    html = download(url, gently=True)
    doc = pyquery.PyQuery(html)
    doc.make_links_absolute(get_base_url(url))
    print "URL:", url
    for a in doc('.span3 li a').items():
        if a.text() == 'RSS':
            feed_url = a.attr('href')
            response = requests.head(feed_url)
            if response.status_code in (301, 302):
                feed_url = response.headers['Location']
            if Podcast.objects.filter(url=feed_url).exists():
                # print "ALREADY HAD", feed_url
                continue
            try:
                image_url = get_image_url(feed_url)
            except ConnectionError:
                print('Unable to download image for {}'.format(feed_url))
                continue
            except ExpatError:
                print('ExpatError when getting image on {}'.format(feed_url))
                continue
            except NotXMLResponse:
                print(
                    'NotXMLResponse when getting image on {}'.format(feed_url)
                )
                continue
            if not image_url:
                print "Skipping (no image)", feed_url
                continue
            assert '://' in image_url, image_url
            podcast = Podcast.objects.create(
                url=feed_url,
                image_url=image_url,
            )
            return podcast
            # print repr(podcast)
            # NOTE: the early return above makes the two calls below unreachable
            podcast.download_image()
            podcast.download_episodes()
def _scrape_index(url, verbose=False, max_=1000):
    try:
        html = download(url, gently=True)
    except requests_operational_errors:
        return
    doc = pyquery.PyQuery(html)
    links = doc(".thumbnails a")
    shows = []
    for link in links:
        show_url = link.attrib["href"]
        show_url = urljoin(url, show_url)
        link = pyquery.PyQuery(link)
        for h4 in link.find("h4"):
            name = h4.text_content()
            shows.append((name, show_url))
    existing_names = Podcast.objects.all().values_list("name", flat=True)
    # XXX might not keep this
    shows = [(n, u) for (n, u) in shows if n not in existing_names]
    random.shuffle(shows)
    for name, show_url in shows[:max_]:
        rss_url = _scrape_show(show_url)
        if not rss_url:
            print("Skipping", name, show_url)
            continue
        image_url = get_image_url(rss_url)
        if not image_url:
            print("Skipping (no image)", name, rss_url)
            continue
        assert "://" in image_url, image_url
        podcast, created = Podcast.objects.get_or_create(name=name, url=rss_url)
        podcast.image_url = image_url
        podcast.save()
        # try:
        #     podcast = Podcast.objects.get(name=name)
        #     podcast.url = rss_url
        #     podcast.image_url = image_url
        #     podcast.save()
        #     created = False
        # except Podcast.DoesNotExist:
        #     assert name, rss_url
        #     podcast = Podcast.objects.create(
        #         name=name,
        #         url=rss_url,
        #         image_url=image_url,
        #     )
        #     created = True
        try:
            podcast.download_image()
        except (AssertionError, NotAnImageError):
            if verbose:
                print("Got an error trying to download the image :(")
                print("IGNORING AND MOVING ON")
            PodcastError.create(podcast)
        if verbose:
            if created:
                print("CREATED")
            else:
                print("NOT NEW")
            print(repr(name))
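# Hedged usage sketch (not part of the original module): `_scrape_index`
# skips shows whose names are already in the database and then processes at
# most `max_` randomly chosen new shows per call. The URL and helper name
# below are hypothetical placeholders used only for illustration.
def _scrape_a_few_index_pages():
    for page in range(1, 4):
        _scrape_index(
            "https://example.com/podcasts/?page={}".format(page),
            verbose=True,
            max_=50,
        )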
def _scrape_feed(url, tested_urls, verbose=False):
    html = download(url, gently=True)
    doc = pyquery.PyQuery(html)
    doc.make_links_absolute(get_base_url(url))
    print("URL:", url)
    for a in doc(".span3 li a").items():
        if a.text() == "RSS":
            feed_url = a.attr("href")
            response = requests.head(feed_url)
            if response.status_code in (301, 302):
                feed_url = response.headers["Location"]
            if "://" not in feed_url:
                feed_url = "http://" + feed_url
            if feed_url in tested_urls:
                # We've scraped this one before
                continue
            tested_urls.append(feed_url)
            try:
                podcast = Podcast.objects.get(url=feed_url)
                if podcast.name:
                    continue
            except Podcast.DoesNotExist:
                pass
            try:
                image_url = get_image_url(feed_url)
            except ConnectionError:
                print("Unable to download image for {}".format(feed_url))
                continue
            except ExpatError:
                print("ExpatError when getting image on {}".format(feed_url))
                continue
            except NotXMLResponse:
                print(
                    "NotXMLResponse when getting image on {}".format(feed_url)
                )
                continue
            if not image_url:
                print("Skipping (no image)", feed_url)
                continue
            if image_url.startswith("//"):
                if urlparse(feed_url).scheme == "https":
                    image_url = "https:" + image_url
                else:
                    image_url = "http:" + image_url
            assert "://" in image_url, image_url
            podcast, created = Podcast.objects.get_or_create(
                url=feed_url, image_url=image_url
            )
            if not podcast.name:
                d = feedparser.parse(feed_url)
                print("STATUS?", d.get("status"), feed_url)
                if d.get("status") == 404:
                    print("DELETE {} because of 404 status".format(feed_url))
                    podcast.delete()
                    continue
                if "title" not in d["feed"]:
                    if not d["feed"] and not d["entries"]:
                        print(
                            "DELETE {} because no title, feed or "
                            "entries".format(feed_url)
                        )
                        podcast.delete()
                        continue
                assert d["feed"]["title"], feed_url
                podcast.name = d["feed"]["title"]
                podcast.save()
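# Hedged usage sketch (not part of the original module): `tested_urls` acts as
# a shared accumulator, so a feed already handled while scraping one page is
# skipped when the same RSS link shows up on another. `page_urls` and the
# wrapper name are hypothetical; the pages are assumed to carry the
# ".span3 li a" RSS links this scraper looks for.
def _scrape_many_feeds(page_urls, verbose=False):
    tested_urls = []
    for page_url in page_urls:
        _scrape_feed(page_url, tested_urls, verbose=verbose)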
def _scrape_index(url, verbose=False, max_=1000):
    html = download(url, gently=True)
    doc = pyquery.PyQuery(html)
    links = doc('.thumbnails a')
    shows = []
    for link in links:
        show_url = link.attrib['href']
        show_url = urljoin(url, show_url)
        link = pyquery.PyQuery(link)
        for h4 in link.find('h4'):
            name = h4.text_content()
            shows.append((name, show_url))
    existing_names = Podcast.objects.all().values_list('name', flat=True)
    # XXX might not keep this
    shows = [
        (n, u) for (n, u) in shows
        if n not in existing_names
    ]
    random.shuffle(shows)
    for name, show_url in shows[:max_]:
        rss_url = _scrape_show(show_url)
        if not rss_url:
            print "Skipping", name, show_url
            continue
        image_url = get_image_url(rss_url)
        if not image_url:
            print "Skipping (no image)", name, rss_url
            continue
        assert '://' in image_url, image_url
        # print "IMAGE_URL", image_url
        try:
            podcast = Podcast.objects.get(name=name)
            podcast.url = rss_url
            podcast.image_url = image_url
            podcast.save()
            created = False
        except Podcast.DoesNotExist:
            podcast = Podcast.objects.create(
                name=name,
                url=rss_url,
                image_url=image_url,
            )
            created = True
        try:
            podcast.download_image()
        except (AssertionError, NotAnImageError):
            if verbose:
                print "Got an error trying to download the image :("
                print "IGNORING AND MOVING ON"
            PodcastError.create(podcast)
        if verbose:
            if created:
                print "CREATED",
            else:
                print "NOT NEW",
            print repr(name)