def fetch_itunes_lookup(podcast_id):
    """Look up a podcast on iTunes by name and store the raw lookup dict.

    The lookup is only saved when the match is unambiguous: either a single
    result, an exact (case-insensitive) name match, or a result whose feed
    URL equals the podcast's URL. Everything else is just logged.
    """
    podcast = Podcast.objects.get(id=podcast_id)
    print("Fetching itunes lookup: {!r}".format(podcast.name))
    results = itunes_search(podcast.name)
    if not results:
        print("Nothing returned on itunes lookup: {!r}".format(podcast.name))
        return

    def store(match):
        # Persist the chosen iTunes result on the podcast row.
        podcast.itunes_lookup = match
        podcast.save()

    candidates = results["results"]
    count = results["resultCount"]
    if count == 1:
        store(candidates[0])
    elif count > 1:
        # Pick the first one if it's a slam dunk
        first = candidates[0]
        if podcast.name.lower() == first["collectionName"].lower():
            store(first)
        elif podcast.url in [x.get("feedUrl") for x in candidates]:
            store([x for x in candidates if x.get("feedUrl") == podcast.url][0])
        else:
            print("Too ambiguous ({!r} != {!r}, {!r} != {!r})".format(
                podcast.name,
                first["collectionName"],
                podcast.url,
                first.get("feedUrl"),
            ))
    else:
        print("Found no results")
def fetch_itunes_lookup(podcast_id):
    """Fetch and persist the iTunes lookup payload for one podcast.

    Only an unambiguous match is saved: a lone result, an exact
    case-insensitive name match, or a feed-URL match. Ambiguous or empty
    searches are logged and left alone.
    """
    podcast = Podcast.objects.get(id=podcast_id)
    print("Fetching itunes lookup: {!r}".format(podcast.name))
    results = itunes_search(podcast.name)
    if not results:
        print("Nothing returned on itunes lookup: {!r}".format(podcast.name))
        return
    hits = results["results"]
    total = results["resultCount"]
    if total == 1:
        podcast.itunes_lookup = hits[0]
        podcast.save()
    elif total > 1:
        # Pick the first one if it's a slam dunk
        lookup = hits[0]
        feed_urls = [x.get("feedUrl") for x in hits]
        if podcast.name.lower() == lookup["collectionName"].lower():
            podcast.itunes_lookup = lookup
            podcast.save()
        elif podcast.url in feed_urls:
            # index() picks the first feed-URL match, same as filtering
            # the list and taking element zero.
            podcast.itunes_lookup = hits[feed_urls.index(podcast.url)]
            podcast.save()
        else:
            print(
                "Too ambiguous ({!r} != {!r}, {!r} != {!r})".format(
                    podcast.name,
                    lookup["collectionName"],
                    podcast.url,
                    lookup.get("feedUrl"),
                )
            )
    else:
        print("Found no results")
def search_by_itunes(q):
    """Search iTunes titles for *q* and create Podcast rows for new results.

    At most the first 10 results are considered; newly created podcasts get
    their artwork downloaded and an episode-download task submitted.
    """
    print("ITUNES SEARCHING {!r}".format(q))
    try:
        hits = itunes_search(q, attribute="titleTerm", timeout=6)["results"]
    except (ReadTimeout, ConnectTimeout):
        # Treat a slow/unreachable iTunes API as an empty result set.
        hits = []
    print("FOUND {}".format(len(hits)))
    created = 0
    for hit in hits[:10]:
        feed_url = hit.get("feedUrl")
        if not feed_url:
            # Without a feed URL there is nothing to subscribe to.
            continue
        name = hit["collectionName"]
        try:
            Podcast.objects.get(url=feed_url, name=name)
        except Podcast.DoesNotExist:
            assert name, hit
            podcast = Podcast.objects.create(
                name=name,
                url=feed_url,
                itunes_lookup=hit,
                image_url=hit["artworkUrl600"],
            )
            try:
                podcast.download_image(timeout=3)
            except (ReadTimeout, ConnectTimeout):
                # Artwork fetch timed out; retry it out of band.
                redownload_podcast_image(podcast.id)
            download_episodes_task(podcast.id)
            created += 1
    print("Found {} new podcasts by iTunes search".format(created))
def search_by_itunes(q):
    """Run an iTunes title search for *q*, creating podcasts we do not have.

    Considers at most the first 10 usable results; network timeouts are
    treated as "no results".
    """
    print("ITUNES SEARCHING {!r}".format(q))
    try:
        found = itunes_search(q, attribute="titleTerm", timeout=6)["results"]
    except (ReadTimeout, ConnectTimeout):
        found = []  # iTunes unavailable: behave as if nothing matched

    def ensure_podcast(entry):
        # Create the Podcast if unseen; True only when a new row was made.
        try:
            Podcast.objects.get(
                url=entry["feedUrl"], name=entry["collectionName"]
            )
            return False
        except Podcast.DoesNotExist:
            assert entry["collectionName"], entry
            podcast = Podcast.objects.create(
                name=entry["collectionName"],
                url=entry["feedUrl"],
                itunes_lookup=entry,
                image_url=entry["artworkUrl600"],
            )
            try:
                podcast.download_image(timeout=3)
            except (ReadTimeout, ConnectTimeout):
                # Retry the artwork download asynchronously on timeout.
                redownload_podcast_image(podcast.id)
            download_episodes_task(podcast.id)
            return True

    print("FOUND {}".format(len(found)))
    count_new = sum(
        ensure_podcast(entry)
        for entry in found[:10]
        if entry.get("feedUrl")  # skip results with no feed to subscribe to
    )
    print("Found {} new podcasts by iTunes search".format(count_new))
def fetch_itunes_lookup(podcast_id):
    """Fetch the iTunes lookup payload for one podcast.

    The raw lookup dict is stored on the Podcast only when the search
    returns exactly one result; anything else is just logged.
    """
    podcast = Podcast.objects.get(id=podcast_id)
    # Fixed: these were Python 2 print statements, a SyntaxError under
    # Python 3 and inconsistent with the rest of this file.
    print("Fetching itunes lookup", repr(podcast.name))
    results = itunes_search(podcast.name)
    if not results:
        # itunes_search can come back empty/None on failure (the other
        # fetch_itunes_lookup variants in this file guard for this too);
        # bail out instead of crashing on the subscript below.
        print("Nothing returned on itunes lookup", repr(podcast.name))
        return
    if results['resultCount'] == 1:
        lookup = results['results'][0]
        podcast.itunes_lookup = lookup
        podcast.save()
    else:
        print("Found", results['resultCount'], 'results')
        print(results['resultCount'])
def add(request):
    """Add-a-podcast view.

    ``?id=...``     — redirect to the index with that podcast selected,
                      downloading its image/episodes on demand first.
    ``?search=...`` — search iTunes by title, create Podcast rows for any
                      unseen results, and render the matches.
    """
    context = {}
    context['page_title'] = 'Add Podcast'
    # Renamed from `id` to avoid shadowing the builtin.
    podcast_id = request.GET.get('id', '').strip()
    if podcast_id:
        podcast = get_object_or_404(Podcast, id=podcast_id)
        if not podcast.image and podcast.image_url:
            podcast.download_image()
        if not Episode.objects.filter(podcast=podcast).exists():
            download_episodes(podcast)
        url = reverse('podcasttime:index') + '#ids={}'.format(podcast.id)
        return redirect(url)
    search = request.GET.get('search', '').strip()
    context['search'] = search
    if search:
        podcasts = []
        matches = itunes_search(search, attribute='titleTerm')
        for result in matches['results']:
            # Fixed: some iTunes results have no 'feedUrl'; without one we
            # can neither look up nor create a Podcast, so skip them (the
            # other iTunes consumers in this file do the same) instead of
            # raising KeyError below.
            if not result.get('feedUrl'):
                continue
            pod = {
                'image_url': result['artworkUrl600'],
                'itunes_url': result['collectionViewUrl'],
                'artist_name': result['artistName'],
                'tags': result['genres'],
                'name': result['collectionName'],
            }
            try:
                podcast = Podcast.objects.get(
                    url=result['feedUrl'],
                    name=result['collectionName']
                )
            except Podcast.DoesNotExist:
                podcast = Podcast.objects.create(
                    name=result['collectionName'],
                    url=result['feedUrl'],
                    itunes_lookup=result,
                    image_url=result['artworkUrl600'],
                )
                # episodes will be created and downloaded by the cron job
                redownload_podcast_image.delay(podcast.id)
            pod['id'] = podcast.id
            pod['url'] = reverse(
                'podcasttime:podcast_slug',
                args=(podcast.id, podcast.get_or_create_slug())
            )
            podcasts.append(pod)
        context['found'] = matches['resultCount']
        context['podcasts'] = podcasts
    return render(request, 'podcasttime/add.html', context)
def search_by_itunes(q):
    """Search iTunes titles for *q* and create Podcast rows for new results.

    One specific podcast is blocklisted because of a DMCA takedown notice
    (see link in the loop). At most the first 10 results are considered.
    """
    print("ITUNES SEARCHING {!r}".format(q))
    try:
        results = itunes_search(q, attribute="titleTerm", timeout=6)["results"]
    except (ReadTimeout, ConnectTimeout):
        # A slow/unreachable iTunes API is treated as an empty result set.
        results = []
    print("FOUND {}".format(len(results)))
    count_new = 0
    for result in results[:10]:
        if not result.get("feedUrl"):
            # Without a feed URL there is nothing to subscribe to.
            continue
        if (
            result["collectionName"]
            == "dj-andy-bee Deep n Soulful House # Urban Soul Podcast"
            or result["feedUrl"] == "http://dj-andy-b.podOmatic.com/rss2.xml"
        ):
            # Blocklisted per DMCA takedown notice:
            # https://www.lumendatabase.org/notices/18580243#
            # Fixed typo in the log message ("COLLETION" -> "COLLECTION").
            print("REFUSE COLLECTION")
            continue
        try:
            podcast = Podcast.objects.get(
                url=result["feedUrl"], name=result["collectionName"]
            )
        except Podcast.DoesNotExist:
            assert result["collectionName"], result
            podcast = Podcast.objects.create(
                name=result["collectionName"],
                url=result["feedUrl"],
                itunes_lookup=result,
                image_url=result["artworkUrl600"],
            )
            try:
                podcast.download_image(timeout=3)
            except (ReadTimeout, ConnectTimeout):
                # Retry the artwork download asynchronously on timeout.
                redownload_podcast_image(podcast.id)
            download_episodes_task(podcast.id)
            count_new += 1
    print("Found {} new podcasts by iTunes search".format(count_new))
def find(request):
    """JSON search endpoint for podcasts.

    Three modes, selected by query string:
      ?ids=1,2,3            — fetch specific podcasts (Elasticsearch first,
                              ORM fallback when the index is empty).
      ?q=...&submitted=...  — the user pressed enter: search iTunes and
                              create any unseen podcasts.
      ?q=...                — incremental search against Elasticsearch;
                              falls back to an iTunes search when too few
                              hits are found.
    Returns {"items": [...], "total": N, "q": q}.

    NOTE(review): this source arrived whitespace-mangled; the statement
    nesting below is a faithful reconstruction — confirm against VCS.
    """
    if not (request.GET.get("ids") or request.GET.get("q")):
        return http.HttpResponseBadRequest("no ids or q")
    found = []
    max_ = 5  # cap on search results returned per mode
    q = None
    total = None
    # Podcasts whose latest episode is older than this are flagged outdated.
    cutoff = timezone.now() - datetime.timedelta(
        days=settings.LATEST_PODCAST_CUTOFF_DAYS
    )

    def package_podcast(podcast):
        # Normalize either a Podcast ORM instance or an Elasticsearch hit
        # dict into the dict shape the client expects.
        if type(podcast) is Podcast:
            return podcast.to_search_doc()
        else:
            # remove summary fields since it's rather large
            podcast.pop("summary", None)
            if "episodes_count" not in podcast:
                # better than undefined
                podcast["episodes_count"] = None
                podcast["total_hours"] = None
            if podcast.get("episodes_seconds"):
                podcast["total_hours"] = podcast.pop("episodes_seconds") / 3600
            try:
                if podcast["latest_episode"] < cutoff:
                    podcast["_outdated"] = True
                else:
                    podcast["_outdated"] = False
            except KeyError:
                # no latest_episode recorded at all: treat as outdated
                podcast["_outdated"] = True
            return podcast

    if request.GET.get("ids"):
        search = PodcastDoc.search()
        ids = [int(x) for x in request.GET["ids"].split(",")]
        search = search.filter("terms", id=ids)
        response = search.execute()
        if not response.hits.total:
            # Nothing in the search index — fall back to the ORM and poke
            # any stale podcasts into re-downloading their episodes.
            podcasts_orm = Podcast.objects.filter(id__in=ids)
            for podcast in podcasts_orm.filter(error__isnull=True):
                if not podcast.total_seconds or not podcast.last_fetch:
                    # cache key dedupes resubmissions for 60 seconds
                    cache_key = "resubmit:{}".format(podcast.id)
                    if not cache.get(cache_key):
                        print(
                            "Forcing {!r} (id={}) to download episodes".format(
                                podcast.name, podcast.id
                            )
                        )
                        download_episodes_task(podcast.id)
                        cache.set(cache_key, True, 60)
                else:
                    # save() presumably re-indexes the document — confirm
                    podcast.save()
        for hit in response.hits:
            podcast = package_podcast(hit.to_dict())
            if (
                podcast["episodes_count"] is None
                or not podcast.get("last_fetch")
                or podcast["last_fetch"] < (timezone.now() - datetime.timedelta(days=7))
            ):
                # Stale or never-fetched: resubmit the download task,
                # deduped through the cache for 60 seconds.
                cache_key = "resubmit:{}".format(podcast["id"])
                if not cache.get(cache_key):
                    print(
                        "Forcing {!r} (id={}) to download episodes".format(
                            podcast["name"], podcast["id"]
                        )
                    )
                    download_episodes_task(podcast["id"])
                    cache.set(cache_key, True, 60)
                podcast["_updating"] = True
            found.append(podcast)
        # rearrange them in the order they were
        found = sorted(found, key=lambda x: ids.index(x["id"]))
    elif request.GET.get("submitted"):
        q = request.GET["q"]
        try:
            results = itunes_search(q, attribute="titleTerm", timeout=6)["results"]
        except (ReadTimeout, ConnectTimeout):
            # iTunes unavailable: behave as if nothing matched
            results = []
        for result in results[:max_]:
            if not result.get("feedUrl"):
                # can't subscribe without a feed URL
                print("Weird result", result)
                continue
            try:
                podcast = Podcast.objects.get(
                    url=result["feedUrl"], name=result["collectionName"]
                )
            except Podcast.DoesNotExist:
                assert result["collectionName"], result
                podcast = Podcast.objects.create(
                    name=result["collectionName"],
                    url=result["feedUrl"],
                    itunes_lookup=result,
                    image_url=result["artworkUrl600"],
                )
                try:
                    podcast.download_image(timeout=3)
                except (ReadTimeout, ConnectTimeout):
                    # artwork fetch timed out; retry it out of band
                    redownload_podcast_image(podcast.id)
                download_episodes_task(podcast.id)
            # Reload since the task functions operate on a new instance
            # podcast = Podcast.objects.get(id=podcast.id)
            found.append(package_podcast(podcast))
    else:
        q = request.GET["q"]
        search = PodcastDoc.search()
        search = search.query("match_phrase", name=q)
        search = search[:max_]
        response = search.execute()
        total = response.hits.total
        for hit in response.hits:
            podcast = package_podcast(hit.to_dict())
            if podcast["episodes_count"] is None:
                # Never fetched: resubmit, deduped via cache for 60s.
                cache_key = "resubmit:{}".format(podcast["id"])
                if not cache.get(cache_key):
                    print(
                        "Forcing {!r} (id={}) to download episodes".format(
                            podcast["name"], podcast["id"]
                        )
                    )
                    download_episodes_task(podcast["id"])
                    cache.set(cache_key, True, 60)
                # this will force the client-side to re-query for this
                podcast["last_fetch"] = None
            found.append(podcast)
        if total < 5:
            # Thin local results: kick off an iTunes search to backfill.
            print("NOTHING FOUND!", q, "SENDING IT TO iTUNES")
            search_by_itunes(q)
    if total is None:
        total = len(found)
    return http.JsonResponse({"items": found, "total": total, "q": q})
def find(request):
    """JSON search endpoint for podcasts.

    Modes (by query string): ``?ids=...`` fetches specific podcasts
    (Elasticsearch first, ORM fallback); ``?q=...&submitted=...`` searches
    iTunes and creates unseen podcasts; plain ``?q=...`` does an
    incremental Elasticsearch search, backfilling from iTunes when fewer
    than 5 hits come back. Returns {"items", "total", "q"} as JSON.

    NOTE(review): this source arrived whitespace-mangled; the statement
    nesting below is a faithful reconstruction — confirm against VCS.
    """
    if not (request.GET.get("ids") or request.GET.get("q")):
        return http.HttpResponseBadRequest("no ids or q")
    found = []
    max_ = 5  # cap on search results per mode
    q = None
    total = None
    # Podcasts whose latest episode predates this are flagged outdated.
    cutoff = timezone.now() - datetime.timedelta(
        days=settings.LATEST_PODCAST_CUTOFF_DAYS
    )

    def package_podcast(podcast):
        # Normalize a Podcast instance or an Elasticsearch hit dict into
        # the client-facing dict shape.
        if type(podcast) is Podcast:
            return podcast.to_search_doc()
        else:
            # remove summary fields since it's rather large
            podcast.pop("summary", None)
            if "episodes_count" not in podcast:
                # better than undefined
                podcast["episodes_count"] = None
                podcast["total_hours"] = None
            if podcast.get("episodes_seconds"):
                podcast["total_hours"] = podcast.pop("episodes_seconds") / 3600
            try:
                if podcast["latest_episode"] < cutoff:
                    podcast["_outdated"] = True
                else:
                    podcast["_outdated"] = False
            except KeyError:
                # no latest_episode at all: treat as outdated
                podcast["_outdated"] = True
            return podcast

    if request.GET.get("ids"):
        search = PodcastDoc.search()
        ids = [int(x) for x in request.GET["ids"].split(",")]
        search = search.filter("terms", id=ids)
        response = search.execute()
        if not response.hits.total:
            # Index empty — ORM fallback; poke stale podcasts to refresh.
            podcasts_orm = Podcast.objects.filter(id__in=ids)
            for podcast in podcasts_orm.filter(error__isnull=True):
                if not podcast.total_seconds or not podcast.last_fetch:
                    # cache key dedupes resubmissions for 60 seconds
                    cache_key = "resubmit:{}".format(podcast.id)
                    if not cache.get(cache_key):
                        print(
                            "Forcing {!r} (id={}) to download episodes".format(
                                podcast.name, podcast.id
                            )
                        )
                        download_episodes_task(podcast.id)
                        cache.set(cache_key, True, 60)
                else:
                    # save() presumably re-indexes the document — confirm
                    podcast.save()
        for hit in response.hits:
            podcast = package_podcast(hit.to_dict())
            if (
                podcast["episodes_count"] is None
                or not podcast.get("last_fetch")
                or podcast["last_fetch"] < (timezone.now() - datetime.timedelta(days=7))
            ):
                # Stale or never fetched: resubmit the episode-download
                # task, deduped through the cache for 60 seconds.
                cache_key = "resubmit:{}".format(podcast["id"])
                if not cache.get(cache_key):
                    print(
                        "Forcing {!r} (id={}) to download episodes".format(
                            podcast["name"], podcast["id"]
                        )
                    )
                    download_episodes_task(podcast["id"])
                    cache.set(cache_key, True, 60)
                podcast["_updating"] = True
            found.append(podcast)
        # rearrange them in the order they were
        found = sorted(found, key=lambda x: ids.index(x["id"]))
    elif request.GET.get("submitted"):
        q = request.GET["q"]
        try:
            results = itunes_search(q, attribute="titleTerm", timeout=6)["results"]
        except (ReadTimeout, ConnectTimeout):
            # iTunes unavailable: behave as if nothing matched
            results = []
        for result in results[:max_]:
            if not result.get("feedUrl"):
                # can't subscribe without a feed URL
                print("Weird result", result)
                continue
            if (
                result["collectionName"]
                == "dj-andy-bee Deep n Soulful House # Urban Soul Podcast"
                or result["feedUrl"] == "http://dj-andy-b.podOmatic.com/rss2.xml"
            ):
                # Blocklisted because of a DMCA takedown notice:
                # https://www.lumendatabase.org/notices/18580243#
                print("REFUSE COLLETION")
                continue
            try:
                podcast = Podcast.objects.get(
                    url=result["feedUrl"], name=result["collectionName"]
                )
            except Podcast.DoesNotExist:
                assert result["collectionName"], result
                podcast = Podcast.objects.create(
                    name=result["collectionName"],
                    url=result["feedUrl"],
                    itunes_lookup=result,
                    image_url=result["artworkUrl600"],
                )
                try:
                    podcast.download_image(timeout=3)
                except (ReadTimeout, ConnectTimeout):
                    # artwork fetch timed out; retry it out of band
                    redownload_podcast_image(podcast.id)
                download_episodes_task(podcast.id)
            # Reload since the task functions operate on a new instance
            # podcast = Podcast.objects.get(id=podcast.id)
            found.append(package_podcast(podcast))
    else:
        q = request.GET["q"]
        search = PodcastDoc.search()
        search = search.query("match_phrase", name=q)
        search = search[:max_]
        response = search.execute()
        total = response.hits.total
        for hit in response.hits:
            podcast = package_podcast(hit.to_dict())
            if podcast["episodes_count"] is None:
                # Never fetched: resubmit, deduped via cache for 60s.
                cache_key = "resubmit:{}".format(podcast["id"])
                if not cache.get(cache_key):
                    print(
                        "Forcing {!r} (id={}) to download episodes".format(
                            podcast["name"], podcast["id"]
                        )
                    )
                    download_episodes_task(podcast["id"])
                    cache.set(cache_key, True, 60)
                # this will force the client-side to re-query for this
                podcast["last_fetch"] = None
            found.append(podcast)
        if total < 5:
            # Thin local results: kick off an iTunes backfill search.
            print("NOTHING FOUND!", q, "SENDING IT TO iTUNES")
            search_by_itunes(q)
    if total is None:
        total = len(found)
    return http.JsonResponse({"items": found, "total": total, "q": q})
def find(request):
    """JSON search endpoint for podcasts (pre-Elasticsearch version).

    Modes (by query string): ``?ids=...`` fetches specific podcasts in the
    given order; ``?itunes=...&q=...`` searches iTunes and creates unseen
    podcasts; plain ``?q=...`` searches the ORM by prefix, then Postgres
    full-text, then substring. Returns {"items", "q"} as JSON.

    Fixed: Python 2 print statements (a SyntaxError under Python 3,
    inconsistent with the rest of the file) converted to print() calls;
    removed a dead duplicate ``items = []`` and commented-out code.
    """
    if not (request.GET.get('ids') or request.GET.get('q')):
        return http.HttpResponseBadRequest('no ids or q')
    found = []
    max_ = 5  # cap on search results
    q = None
    if request.GET.get('ids'):
        ids = [int(x) for x in request.GET['ids'].split(',')]
        found = Podcast.objects.filter(id__in=ids)
        # rearrange them in the order they were
        found = sorted(found, key=lambda x: ids.index(x.id))
    elif request.GET.get('itunes'):
        q = request.GET['q']
        try:
            results = itunes_search(
                q,
                attribute='titleTerm',
                timeout=6,
            )['results']
        except (ReadTimeout, ConnectTimeout):
            # iTunes unavailable: behave as if nothing matched
            results = []
        for result in results:
            try:
                podcast = Podcast.objects.get(
                    url=result['feedUrl'],
                    name=result['collectionName']
                )
            except Podcast.DoesNotExist:
                podcast = Podcast.objects.create(
                    name=result['collectionName'],
                    url=result['feedUrl'],
                    itunes_lookup=result,
                    image_url=result['artworkUrl600'],
                )
                try:
                    podcast.download_image(timeout=3)
                except (ReadTimeout, ConnectTimeout):
                    # artwork fetch timed out; retry it out of band
                    redownload_podcast_image(podcast.id)
                download_episodes_task.delay(podcast.id)
            # Reload since the task functions operate on a new instance
            # podcast = Podcast.objects.get(id=podcast.id)
            found.append(podcast)
    else:
        q = request.GET['q']
        base_qs = Podcast.objects.filter(error__isnull=True)
        # 1) cheap prefix match first
        podcasts = base_qs.filter(name__istartswith=q)
        for podcast in podcasts[:max_]:
            found.append(podcast)
        if len(q) > 2:
            # 2) Postgres full-text search for longer queries; %s is bound
            # via params, so this is safely parameterized.
            sql = (
                "to_tsvector('english', name) @@ "
                "plainto_tsquery('english', %s)"
            )
            podcasts = base_qs.exclude(
                id__in=[x.id for x in found]
            ).extra(
                where=[sql], params=[q]
            )[:max_]
            for podcast in podcasts[:max_]:
                if len(found) >= max_:
                    break
                found.append(podcast)
        if len(q) > 1:
            # 3) last resort: substring match
            podcasts = base_qs.filter(name__icontains=q).exclude(
                id__in=[x.id for x in found]
            )
            for podcast in podcasts[:max_]:
                if len(found) >= max_:
                    break
                found.append(podcast)

    def episodes_meta(podcast):
        # Cached (24h) episode count + total hours; submits an episode
        # download task when no episodes exist yet.
        episodes_cache_key = 'episodes-meta%s' % podcast.id
        meta = cache.get(episodes_cache_key)
        if meta is None:
            episodes = Episode.objects.filter(podcast=podcast)
            episodes_count = episodes.count()
            total_hours = None
            if episodes_count:
                total_seconds = episodes.aggregate(
                    Sum('duration')
                )['duration__sum']
                if total_seconds:
                    total_hours = total_seconds / 3600.0
            else:
                download_episodes_task.delay(podcast.id)
            meta = {
                'count': episodes_count,
                'total_hours': total_hours,
            }
            if episodes_count:
                cache.set(episodes_cache_key, meta, 60 * 60 * 24)
        return meta

    items = []
    for podcast in found:
        if podcast.image and is_html_document(podcast.image.path):
            # Some feeds serve an HTML error page instead of artwork.
            print("Found a podcast.image that wasn't an image")
            podcast.image = None
            podcast.save()
        if podcast.image:
            if podcast.image.size < 1000:
                # Tiny files are almost certainly broken artwork.
                print("IMAGE LOOKS SUSPICIOUS")
                print(podcast.image_url)
                print(repr(podcast), podcast.id)
                print(podcast.url)
                print(repr(podcast.image.read()))
                podcast.download_image()
        thumb_url = None
        if podcast.image:
            try:
                thumb_url = thumbnail(
                    podcast.image,
                    '100x100',
                    quality=81,
                    upscale=False
                ).url
                thumb_url = make_absolute_url(thumb_url, request)
            except IOError:
                # Corrupt image: log, drop it, and re-download out of band.
                import sys
                print("BAD IMAGE!")
                print(sys.exc_info())
                print(repr(podcast.image))
                print(repr(podcast), podcast.url)
                print()
                podcast.image = None
                podcast.save()
                redownload_podcast_image.delay(podcast.id)
        else:
            redownload_podcast_image.delay(podcast.id)
        # Temporarily put here
        if podcast.itunes_lookup is None:
            fetch_itunes_lookup.delay(podcast.id)
        meta = episodes_meta(podcast)
        episodes_count = meta['count']
        total_hours = meta['total_hours']
        items.append({
            'id': podcast.id,
            'name': podcast.name,
            'image_url': thumb_url,
            'episodes': episodes_count,
            'hours': total_hours,
            'last_fetch': podcast.last_fetch,
            'slug': podcast.get_or_create_slug(),
            'url': reverse(
                'podcasttime:podcast_slug',
                args=(podcast.id, podcast.get_or_create_slug())
            ),
        })
    return http.JsonResponse({
        'items': items,
        'q': q,
    })