Exemplo n.º 1
0
def find_podcasts(url, verbose=False, depth=0):
    """Crawl a podcast directory page and scrape what it links to.

    This is a generator.  The MD5 hex digest of ``get_base_url(url)``
    selects one of two crawl strategies:

    * Digest equals the hard-coded value: download the page
      (``url + '/feeds'`` when ``depth`` is 0), hand up to 10 randomly
      chosen ``h3 a`` links to ``_scrape_feed`` and, while ``depth < 5``,
      recurse into up to 10 randomly chosen ``?page=`` pagination links,
      yielding whatever the recursive calls yield.
    * Any other digest: download the page, collect the ``/browse/`` links
      found under ``ul.nav ul.dropdown-menu li a`` and yield
      ``_scrape_index(...)`` for up to 10 randomly chosen ones.

    Args:
        url: Page to start crawling from.
        verbose: Forwarded to ``_scrape_feed`` / ``_scrape_index``.
        depth: Current recursion depth (0 for the initial call).

    Yields:
        The return values of ``_scrape_index``, directly or via recursion.
    """
    max_ = 10  # upper bound on links followed per page and per link kind

    base_url = get_base_url(url)
    # hashlib only accepts bytes; encode text so this also works on Python 3.
    if not isinstance(base_url, bytes):
        base_url = base_url.encode('utf-8')
    hash_ = hashlib.md5(base_url).hexdigest()
    print(url, hash_, depth)

    if hash_ == '73eb773086aa7f75654f4a2d25ca315b':
        if not depth:
            url = url + '/feeds'
        html = download(url)
        doc = pyquery.PyQuery(html)
        doc.make_links_absolute(base_url=get_base_url(url))

        feed_urls = []
        for a in doc('h3 a').items():
            if a.text() == 'Join Now to Follow':
                continue
            href = a.attr('href')
            if href:  # an anchor without href has nothing to scrape
                feed_urls.append(href)

        random.shuffle(feed_urls)
        for feed_url in feed_urls[:max_]:
            try:
                # The return value of _scrape_feed is not used here.
                _scrape_feed(
                    feed_url,
                    verbose=verbose,
                )
            except NotFound:
                print("WARNING Can't find {}".format(feed_url))

        # Now find the next pages
        if depth < 5:
            next_urls = []
            for a in doc('.pagination a').items():
                href = a.attr('href')
                if href and '?page=' in href:
                    next_urls.append(href)
            # Shuffle the pagination links themselves (the original
            # re-shuffled the already-consumed feed list by mistake).
            random.shuffle(next_urls)
            for next_url in next_urls[:max_]:
                for podcast in find_podcasts(
                    next_url,
                    verbose=verbose,
                    depth=depth + 1
                ):
                    yield podcast
    else:
        html = download(url)
        doc = pyquery.PyQuery(html)
        doc.make_links_absolute(base_url=get_base_url(url))

        browse_urls = []
        for a in doc('ul.nav ul.dropdown-menu li a'):
            href = a.attrib.get('href', '')
            if '/browse/' in href:
                # Collect the matched link, not the page we are already on.
                browse_urls.append(href)

        random.shuffle(browse_urls)
        for browse_url in browse_urls[:max_]:
            yield _scrape_index(
                browse_url,
                verbose=verbose,
                max_=max_,
            )
Exemplo n.º 2
0
def find_podcasts(url, verbose=False, depth=0, tested_urls=None):
    """Crawl a podcast directory page and scrape what it links to.

    This is a generator.  The MD5 hex digest of ``get_base_url(url)``
    selects one of two crawl strategies:

    * Digest equals the hard-coded value: download the page
      (``url + "/feeds"`` when ``depth`` is 0), hand up to 10 randomly
      chosen ``h3 a`` links to ``_scrape_feed`` and, while ``depth < 5``,
      recurse into up to 10 randomly chosen ``?page=`` pagination links,
      yielding whatever the recursive calls yield.
    * Any other digest: download the page (returning silently on
      ``ConnectionError``), collect the ``/browse/`` links found under
      ``ul.nav ul.dropdown-menu li a`` and yield ``_scrape_index(...)`` for
      up to 10 randomly chosen ones.

    Args:
        url: Page to start crawling from.
        verbose: Forwarded to ``_scrape_feed`` / ``_scrape_index``.
        depth: Current recursion depth (0 for the initial call).
        tested_urls: List shared with ``_scrape_feed`` and with recursive
            calls; a fresh list is created when ``None``.

    Yields:
        The return values of ``_scrape_index``, directly or via recursion.
    """
    max_ = 10  # upper bound on links followed per page and per link kind
    hash_ = hashlib.md5(get_base_url(url).encode("utf-8")).hexdigest()
    print((url, hash_, depth))
    if tested_urls is None:
        tested_urls = []  # one list shared by the whole crawl

    if hash_ == "73eb773086aa7f75654f4a2d25ca315b":
        if not depth:
            url = url + "/feeds"
        html = download(url)
        doc = pyquery.PyQuery(html)
        doc.make_links_absolute(base_url=get_base_url(url))

        # Skip the sign-up teaser and anchors that carry no href at all.
        feed_urls = [
            a.attr("href")
            for a in doc("h3 a").items()
            if a.text() != "Join Now to Follow" and a.attr("href")
        ]
        random.shuffle(feed_urls)
        for feed_url in feed_urls[:max_]:
            try:
                # The return value of _scrape_feed is not used here.
                _scrape_feed(feed_url, tested_urls, verbose=verbose)
            except NotFound:
                print("WARNING Can't find {}".format(feed_url))

        # Now find the next pages
        if depth < 5:
            next_urls = [
                a.attr("href")
                for a in doc(".pagination a").items()
                if a.attr("href") and "?page=" in a.attr("href")
            ]
            # Shuffle the pagination links themselves (the original
            # re-shuffled the already-consumed feed list by mistake).
            random.shuffle(next_urls)
            for next_url in next_urls[:max_]:
                for podcast in find_podcasts(next_url,
                                             verbose=verbose,
                                             depth=depth + 1,
                                             tested_urls=tested_urls):
                    yield podcast
    else:
        try:
            html = download(url)
        except ConnectionError:
            return
        doc = pyquery.PyQuery(html)
        doc.make_links_absolute(base_url=get_base_url(url))

        # Collect the matched links, not the page we are already on.
        browse_urls = [
            a.attrib["href"]
            for a in doc("ul.nav ul.dropdown-menu li a")
            if "/browse/" in a.attrib.get("href", "")
        ]
        random.shuffle(browse_urls)
        for browse_url in browse_urls[:max_]:
            yield _scrape_index(browse_url, verbose=verbose, max_=max_)
Exemplo n.º 3
0
def find_podcasts(url, verbose=False, depth=0, tested_urls=None):
    """Generator that crawls a podcast directory starting at ``url``.

    The MD5 hex digest of ``get_base_url(url)`` decides how the page is
    crawled:

    * When it equals the hard-coded digest, the page (``url + "/feeds"`` at
      depth 0) is downloaded, up to 10 randomly picked ``h3 a`` links are
      passed to ``_scrape_feed``, and while ``depth < 5`` up to 10 randomly
      picked ``?page=`` pagination links are crawled recursively; results
      of the recursive calls are re-yielded.
    * Otherwise the page is downloaded (a ``ConnectionError`` ends the
      generator quietly), ``/browse/`` links are taken from
      ``ul.nav ul.dropdown-menu li a`` and ``_scrape_index(...)`` is yielded
      for up to 10 randomly picked ones.

    Args:
        url: Page to start crawling from.
        verbose: Forwarded to ``_scrape_feed`` / ``_scrape_index``.
        depth: Current recursion depth (0 for the initial call).
        tested_urls: List handed to ``_scrape_feed`` and to recursive
            calls; a new list is created when ``None``.

    Yields:
        The return values of ``_scrape_index``, directly or via recursion.
    """
    limit = 10  # upper bound on links followed per page and per link kind
    hash_ = hashlib.md5(get_base_url(url).encode("utf-8")).hexdigest()
    print((url, hash_, depth))
    if tested_urls is None:
        tested_urls = []  # one list shared by the whole crawl

    if hash_ != "73eb773086aa7f75654f4a2d25ca315b":
        try:
            html = download(url)
        except ConnectionError:
            return
        doc = pyquery.PyQuery(html)
        doc.make_links_absolute(base_url=get_base_url(url))

        browse_urls = []
        for a in doc("ul.nav ul.dropdown-menu li a"):
            href = a.attrib.get("href", "")
            if "/browse/" in href:
                # Keep the matched link; the original appended the current
                # page URL here, so the same page was indexed repeatedly.
                browse_urls.append(href)

        random.shuffle(browse_urls)
        for browse_url in browse_urls[:limit]:
            yield _scrape_index(browse_url, verbose=verbose, max_=limit)
        return

    if not depth:
        url = url + "/feeds"
    html = download(url)
    doc = pyquery.PyQuery(html)
    doc.make_links_absolute(base_url=get_base_url(url))

    feed_urls = []
    for a in doc("h3 a").items():
        if a.text() == "Join Now to Follow":
            continue
        href = a.attr("href")
        if href:  # an anchor without href has nothing to scrape
            feed_urls.append(href)

    random.shuffle(feed_urls)
    for feed_url in feed_urls[:limit]:
        try:
            # The return value of _scrape_feed is not used here.
            _scrape_feed(feed_url, tested_urls, verbose=verbose)
        except NotFound:
            print("WARNING Can't find {}".format(feed_url))

    # Now find the next pages
    if depth >= 5:
        return
    next_urls = []
    for a in doc(".pagination a").items():
        href = a.attr("href")
        if href and "?page=" in href:
            next_urls.append(href)
    # Randomize the pagination links; the original shuffled the feed list
    # a second time and left these in document order.
    random.shuffle(next_urls)
    for next_url in next_urls[:limit]:
        for podcast in find_podcasts(
            next_url, verbose=verbose, depth=depth + 1, tested_urls=tested_urls
        ):
            yield podcast
Exemplo n.º 4
0
def _scrape_feed(url, verbose=False):
    """Create a ``Podcast`` for the first new RSS feed linked from ``url``.

    The page is downloaded and every ``.span3 li a`` anchor whose text is
    ``RSS`` is considered in document order.  A 301/302 answer to a HEAD
    request replaces the feed URL with the ``Location`` header.  Feeds that
    already have a ``Podcast`` row, whose image lookup fails, or that have
    no image are skipped.

    Args:
        url: Page that lists the feed links.
        verbose: Accepted for interface compatibility; not used here.

    Returns:
        The newly created ``Podcast`` for the first acceptable feed, or
        ``None`` when no anchor produced one.
    """
    html = download(url, gently=True)
    doc = pyquery.PyQuery(html)
    doc.make_links_absolute(get_base_url(url))
    # Single-argument print() calls behave the same on Python 2 and 3.
    print("URL: {}".format(url))
    for a in doc('.span3 li a').items():
        if a.text() != 'RSS':
            continue
        feed_url = a.attr('href')
        response = requests.head(feed_url)
        if response.status_code in (301, 302):
            feed_url = response.headers['Location']
        if Podcast.objects.filter(url=feed_url).exists():
            # Already stored; nothing to do for this feed.
            continue
        try:
            image_url = get_image_url(feed_url)
        except ConnectionError:
            print('Unable to download image for {}'.format(feed_url))
            continue
        except ExpatError:
            print('ExpatError when getting image on {}'.format(feed_url))
            continue
        except NotXMLResponse:
            print(
                'NotXMLResponse when getting image on {}'.format(feed_url)
            )
            continue
        if not image_url:
            print("Skipping (no image) {}".format(feed_url))
            continue
        assert '://' in image_url, image_url
        # NOTE(review): the original had download_image() and
        # download_episodes() calls after this return; they were unreachable
        # and have been dropped.  Confirm returning early is still intended.
        return Podcast.objects.create(
            url=feed_url,
            image_url=image_url,
        )
Exemplo n.º 5
0
def _scrape_feed(url, tested_urls, verbose=False):
    """Store a named ``Podcast`` for every new RSS feed linked from ``url``.

    The page is downloaded and every ``.span3 li a`` anchor whose text is
    ``RSS`` is processed.  For each feed the function resolves one level of
    HTTP redirect, skips URLs already listed in ``tested_urls`` or already
    stored with a name, looks up the feed image, gets or creates the
    ``Podcast`` row and fills in its name from the parsed feed.  Rows whose
    feed answers 404 or parses to nothing are deleted again.

    Args:
        url: Page that lists the feed links.
        tested_urls: List of feed URLs already handled; each new feed URL is
            appended to it (the caller's list is mutated).
        verbose: Accepted for interface compatibility; not used here.

    Returns:
        None.
    """
    html = download(url, gently=True)
    doc = pyquery.PyQuery(html)
    doc.make_links_absolute(get_base_url(url))
    print("URL:", url)
    for a in doc(".span3 li a").items():
        if a.text() != "RSS":
            continue
        feed_url = a.attr("href")
        if not feed_url:
            # An anchor without href would make requests.head() blow up.
            continue
        response = requests.head(feed_url)
        # Any redirect status counts, not only 301/302; keep the original
        # URL when the server sent no Location header.
        if response.status_code in (301, 302, 303, 307, 308):
            feed_url = response.headers.get("Location") or feed_url
        if "://" not in feed_url:
            feed_url = "http://" + feed_url
        if feed_url in tested_urls:
            # We've scraped this one before
            continue
        tested_urls.append(feed_url)
        try:
            podcast = Podcast.objects.get(url=feed_url)
            if podcast.name:
                # Already stored with a name; nothing left to do.
                continue
        except Podcast.DoesNotExist:
            pass
        try:
            image_url = get_image_url(feed_url)
        except ConnectionError:
            print("Unable to download image for {}".format(feed_url))
            continue
        except ExpatError:
            print("ExpatError when getting image on {}".format(feed_url))
            continue
        except NotXMLResponse:
            print(
                "NotXMLResponse when getting image on {}".format(feed_url))
            continue
        if not image_url:
            print("Skipping (no image)", feed_url)
            continue
        if image_url.startswith("//"):
            # Protocol-relative image URL: borrow the feed's scheme.
            if urlparse(feed_url).scheme == "https":
                image_url = "https:" + image_url
            else:
                image_url = "http:" + image_url
        assert "://" in image_url, image_url
        # NOTE(review): url and image_url are both lookup keys here, so an
        # existing nameless row with a different image_url is not matched.
        # Confirm whether image_url should go into ``defaults`` instead.
        podcast, _ = Podcast.objects.get_or_create(
            url=feed_url, image_url=image_url)
        if not podcast.name:
            d = feedparser.parse(feed_url)
            print("STATUS?", d.get("status"), feed_url)
            if d.get("status") == 404:
                print("DELETE {} because of 404 status".format(feed_url))
                podcast.delete()
                continue
            if "title" not in d["feed"]:
                if not d["feed"] and not d["entries"]:
                    print("DELETE {} becuase not title, feed or "
                          "entries".format(feed_url))
                    podcast.delete()
                    continue
            # NOTE(review): a feed without a title that is not completely
            # empty reaches this lookup - confirm failing here is intended.
            assert d["feed"]["title"], feed_url
            podcast.name = d["feed"]["title"]
            podcast.save()
Exemplo n.º 6
0
def _scrape_feed(url, tested_urls, verbose=False):
    """Scrape the RSS links on ``url`` and store a ``Podcast`` for each.

    Every ``.span3 li a`` anchor with the text ``RSS`` is handled in turn:

    1. Resolve one level of HTTP redirect via a HEAD request and prefix
       ``http://`` when the resulting URL has no scheme.
    2. Skip the feed if it is in ``tested_urls`` (otherwise record it
       there) or if a ``Podcast`` with that URL already has a name.
    3. Look up the feed image; skip the feed when that fails or there is
       no image.
    4. Get or create the ``Podcast`` and, if it has no name yet, parse the
       feed to set one.  The row is deleted when the feed answers 404 or
       parses to neither feed data nor entries.

    Args:
        url: Page that lists the feed links.
        tested_urls: List of feed URLs already handled; mutated in place.
        verbose: Accepted for interface compatibility; not used here.

    Returns:
        None.
    """
    redirect_codes = (301, 302, 303, 307, 308)

    html = download(url, gently=True)
    doc = pyquery.PyQuery(html)
    doc.make_links_absolute(get_base_url(url))
    print("URL:", url)
    for a in doc(".span3 li a").items():
        if a.text() != "RSS":
            continue

        feed_url = a.attr("href")
        if not feed_url:
            # No href means there is nothing to request; the original
            # passed None to requests.head() in this case.
            continue
        response = requests.head(feed_url)
        if response.status_code in redirect_codes:
            # Fall back to the original URL if Location is absent rather
            # than raising KeyError.
            feed_url = response.headers.get("Location") or feed_url
        if "://" not in feed_url:
            feed_url = "http://" + feed_url

        if feed_url in tested_urls:
            # We've scraped this one before
            continue
        tested_urls.append(feed_url)

        try:
            podcast = Podcast.objects.get(url=feed_url)
            if podcast.name:
                # Already stored with a name; nothing left to do.
                continue
        except Podcast.DoesNotExist:
            pass

        try:
            image_url = get_image_url(feed_url)
        except ConnectionError:
            print("Unable to download image for {}".format(feed_url))
            continue
        except ExpatError:
            print("ExpatError when getting image on {}".format(feed_url))
            continue
        except NotXMLResponse:
            print("NotXMLResponse when getting image on {}".format(feed_url))
            continue
        if not image_url:
            print("Skipping (no image)", feed_url)
            continue
        if image_url.startswith("//"):
            # Protocol-relative image URL: borrow the feed's scheme.
            if urlparse(feed_url).scheme == "https":
                image_url = "https:" + image_url
            else:
                image_url = "http:" + image_url
        assert "://" in image_url, image_url

        # NOTE(review): url and image_url are both lookup keys here, so an
        # existing nameless row with a different image_url is not matched.
        # Confirm whether image_url should go into ``defaults`` instead.
        podcast, _ = Podcast.objects.get_or_create(
            url=feed_url, image_url=image_url
        )
        if podcast.name:
            continue

        d = feedparser.parse(feed_url)
        print("STATUS?", d.get("status"), feed_url)
        if d.get("status") == 404:
            print("DELETE {} because of 404 status".format(feed_url))
            podcast.delete()
            continue
        if "title" not in d["feed"]:
            if not d["feed"] and not d["entries"]:
                print(
                    "DELETE {} becuase not title, feed or "
                    "entries".format(feed_url)
                )
                podcast.delete()
                continue
        # NOTE(review): a feed without a title that is not completely empty
        # reaches this lookup - confirm failing here is intended.
        assert d["feed"]["title"], feed_url
        podcast.name = d["feed"]["title"]
        podcast.save()