Exemplo n.º 1
0
def _scrape_media(url,
                  autoplay=False,
                  maxwidth=600,
                  force=False,
                  use_cache=False,
                  max_cache_age=None):
    media = None
    autoplay = bool(autoplay)
    maxwidth = int(maxwidth)

    # Use media from the cache (if available)
    if not force and use_cache:
        mediaByURL = MediaByURL.get(url,
                                    autoplay=autoplay,
                                    maxwidth=maxwidth,
                                    max_cache_age=max_cache_age)
        if mediaByURL:
            media = mediaByURL.media

    # Otherwise, scrape it
    if not media:
        media_object = secure_media_object = None
        thumbnail_image = thumbnail_url = thumbnail_size = None

        scraper = Scraper.for_url(url, autoplay=autoplay)
        try:
            thumbnail_image, media_object, secure_media_object = (
                scraper.scrape())
        except (HTTPError, URLError) as e:
            if use_cache:
                MediaByURL.add_error(url,
                                     str(e),
                                     autoplay=autoplay,
                                     maxwidth=maxwidth)
            return None

        # the scraper should be able to make a media embed out of the
        # media object it just gave us. if not, null out the media object
        # to protect downstream code
        if media_object and not scraper.media_embed(media_object):
            print "%s made a bad media obj for url %s" % (scraper, url)
            media_object = None

        if (secure_media_object
                and not scraper.media_embed(secure_media_object)):
            print "%s made a bad secure media obj for url %s" % (scraper, url)
            secure_media_object = None

        if thumbnail_image:
            thumbnail_size = thumbnail_image.size
            thumbnail_url = upload_media(thumbnail_image)

        media = Media(media_object, secure_media_object, thumbnail_url,
                      thumbnail_size)

    # Store the media in the cache (if requested), possibly extending the ttl
    if use_cache and media is not ERROR_MEDIA:
        MediaByURL.add(url, media, autoplay=autoplay, maxwidth=maxwidth)

    return media
Exemplo n.º 2
0
def _scrape_media(url, autoplay=False, maxwidth=600, force=False,
                  save_thumbnail=True, use_cache=False, max_cache_age=None):
    media = None
    autoplay = bool(autoplay)
    maxwidth = int(maxwidth)

    # Use media from the cache (if available)
    if not force and use_cache:
        mediaByURL = MediaByURL.get(url,
                                    autoplay=autoplay,
                                    maxwidth=maxwidth,
                                    max_cache_age=max_cache_age)
        if mediaByURL:
            media = mediaByURL.media

    # Otherwise, scrape it
    if not media:
        media_object = secure_media_object = None
        thumbnail_image = thumbnail_url = thumbnail_size = None

        scraper = Scraper.for_url(url, autoplay=autoplay)
        try:
            thumbnail_image, media_object, secure_media_object = (
                scraper.scrape())
        except (HTTPError, URLError) as e:
            if use_cache:
                MediaByURL.add_error(url, str(e),
                                     autoplay=autoplay,
                                     maxwidth=maxwidth)
            return None

        # the scraper should be able to make a media embed out of the
        # media object it just gave us. if not, null out the media object
        # to protect downstream code
        if media_object and not scraper.media_embed(media_object):
            print "%s made a bad media obj for url %s" % (scraper, url)
            media_object = None

        if (secure_media_object and
            not scraper.media_embed(secure_media_object)):
            print "%s made a bad secure media obj for url %s" % (scraper, url)
            secure_media_object = None

        if thumbnail_image and save_thumbnail:
            thumbnail_size = thumbnail_image.size
            thumbnail_url = upload_media(thumbnail_image)

        media = Media(media_object, secure_media_object,
                      thumbnail_url, thumbnail_size)

    # Store the media in the cache (if requested), possibly extending the ttl
    use_cache = use_cache and save_thumbnail    # don't cache partial scrape
    if use_cache and media is not ERROR_MEDIA:
        MediaByURL.add(url,
                       media,
                       autoplay=autoplay,
                       maxwidth=maxwidth)

    return media
Exemplo n.º 3
0
def _scrape_media(url, autoplay=False, maxwidth=600, force=False,
                  save_thumbnail=True, use_cache=False, max_cache_age=None,
                  use_youtube_scraper=False):
    media = None
    autoplay = bool(autoplay)
    maxwidth = int(maxwidth)

    # Use media from the cache (if available)
    if not force and use_cache:
        mediaByURL = MediaByURL.get(url,
                                    autoplay=autoplay,
                                    maxwidth=maxwidth,
                                    max_cache_age=max_cache_age)
        if mediaByURL:
            media = mediaByURL.media

    # Otherwise, scrape it if thumbnail is not present
    if not media or not media.thumbnail_url:
        media_object = secure_media_object = None
        thumbnail_image = thumbnail_url = thumbnail_size = None

        scraper = Scraper.for_url(url, autoplay=autoplay,
                                  use_youtube_scraper=use_youtube_scraper)
        try:
            thumbnail_image, preview_object, media_object, secure_media_object = (
                scraper.scrape())
        except (HTTPError, URLError) as e:
            if use_cache:
                MediaByURL.add_error(url, str(e),
                                     autoplay=autoplay,
                                     maxwidth=maxwidth)
            return None

        # the scraper should be able to make a media embed out of the
        # media object it just gave us. if not, null out the media object
        # to protect downstream code
        if media_object and not scraper.media_embed(media_object):
            print "%s made a bad media obj for url %s" % (scraper, url)
            media_object = None

        if (secure_media_object and
            not scraper.media_embed(secure_media_object)):
            print "%s made a bad secure media obj for url %s" % (scraper, url)
            secure_media_object = None

        # If thumbnail can't be found, attempt again using _ThumbnailOnlyScraper
        # This should fix bugs that occur when embed.ly caches links before the 
        # thumbnail is available
        if (not thumbnail_image and 
                not isinstance(scraper, _ThumbnailOnlyScraper)):
            scraper = _ThumbnailOnlyScraper(url)
            try:
                thumbnail_image, preview_object, _, _ = scraper.scrape()
            except (HTTPError, URLError) as e:
                use_cache = False

        if thumbnail_image and save_thumbnail:
            thumbnail_size = thumbnail_image.size
            thumbnail_url = upload_media(thumbnail_image)
        else:
            # don't cache if thumbnail is absent
            use_cache = False

        media = Media(media_object, secure_media_object, preview_object,
                      thumbnail_url, thumbnail_size)

    if use_cache and save_thumbnail and media is not ERROR_MEDIA:
        # Store the media in the cache, possibly extending the ttl
        MediaByURL.add(url,
                       media,
                       autoplay=autoplay,
                       maxwidth=maxwidth)

    return media
Exemplo n.º 4
0
def _scrape_media(url, autoplay=False, maxwidth=600, force=False,
                  save_thumbnail=True, use_cache=False, max_cache_age=None):
    media = None
    autoplay = bool(autoplay)
    maxwidth = int(maxwidth)

    # Use media from the cache (if available)
    if not force and use_cache:
        mediaByURL = MediaByURL.get(url,
                                    autoplay=autoplay,
                                    maxwidth=maxwidth,
                                    max_cache_age=max_cache_age)
        if mediaByURL:
            media = mediaByURL.media

    # Otherwise, scrape it
    if not media:
        media_object = secure_media_object = None
        thumbnail_image = thumbnail_url = thumbnail_size = None

        scraper = Scraper.for_url(url, autoplay=autoplay)
        try:
            thumbnail_image, preview_object, media_object, secure_media_object = (
                scraper.scrape())
        except (HTTPError, URLError) as e:
            if use_cache:
                MediaByURL.add_error(url, str(e),
                                     autoplay=autoplay,
                                     maxwidth=maxwidth)
            return None

        # the scraper should be able to make a media embed out of the
        # media object it just gave us. if not, null out the media object
        # to protect downstream code
        if media_object and not scraper.media_embed(media_object):
            print "%s made a bad media obj for url %s" % (scraper, url)
            media_object = None

        if (secure_media_object and
            not scraper.media_embed(secure_media_object)):
            print "%s made a bad secure media obj for url %s" % (scraper, url)
            secure_media_object = None

        # If thumbnail can't be found, attempt again using _ThumbnailOnlyScraper
        # This should fix bugs that occur when embed.ly caches links before the 
        # thumbnail is available
        if (not thumbnail_image and 
                not isinstance(scraper, _ThumbnailOnlyScraper)):
            scraper = _ThumbnailOnlyScraper(url)
            try:
                thumbnail_image, preview_object, _, _ = scraper.scrape()
            except (HTTPError, URLError) as e:
                use_cache = False

        if thumbnail_image and save_thumbnail:
            thumbnail_size = thumbnail_image.size
            thumbnail_url = upload_media(thumbnail_image)
        else:
            # don't cache if thumbnail is absent
            use_cache = False

        media = Media(media_object, secure_media_object, preview_object,
                      thumbnail_url, thumbnail_size)

    if use_cache and save_thumbnail and media is not ERROR_MEDIA:
        # Store the media in the cache, possibly extending the ttl
        MediaByURL.add(url,
                       media,
                       autoplay=autoplay,
                       maxwidth=maxwidth)

    return media
Exemplo n.º 5
0
        # NOTE(review): this excerpt starts mid-loop -- `link`, `url`,
        # `preview_object`, and `k` are bound earlier, outside this view.
        # Skip links whose preview object isn't usable.
        if not good_preview_object(link.preview_object):
            continue
        # Only migrate links whose preview object matches the first one
        # seen for this url; report mismatches and leave them untouched.
        if not link.preview_object == preview_object:
            print "  aborting - preview objects don't match"
            print '    first: %s' % preview_object
            print '    ours:  %s' % link.preview_object
            continue

        # Point the link's preview object at the new url and persist it.
        link.preview_object['url'] = url
        link._commit()
        # Guess at the key that'll contain the (now-incorrect) cache of the
        # preview object so we can delete it and not end up inserting old info
        # into new Links.
        #
        # These parameters are what's used in most of the code; the only place
        # they're overridden is for promoted links, where they could be
        # anything.  We'll just have to deal with those as they come up.
        image_url = _get_scrape_url(link)
        cache_key = MediaByURL._rowkey(image_url, autoplay=False, maxwidth=600)
        print '  deleting cache with key %s' % cache_key
        # Build a stub row and mark it committed so _destroy() issues the
        # delete even though we never loaded the row.
        cache = MediaByURL(_id=cache_key)
        cache._committed = True
        try:
            cache._destroy()
        except pycassa.cassandra.ttypes.InvalidRequestException as e:
            # presumably `why` carries Cassandra's error text -- best-effort
            # deletion, so just log and move on.
            print '    skipping cache deletion (%s)' % e.why
            continue
    # Delete *after* we've updated all the Links so they'll continue to work
    # while we're in the migration process.
    k.delete()
Exemplo n.º 6
0
        # NOTE(review): this excerpt starts mid-loop -- `link`, `url`,
        # `preview_object`, and `k` are bound earlier, outside this view.
        # Skip links whose preview object isn't usable.
        if not good_preview_object(link.preview_object):
            continue
        # Only migrate links whose preview object matches the first one
        # seen for this url; report mismatches and leave them untouched.
        if not link.preview_object == preview_object:
            print "  aborting - preview objects don't match"
            print '    first: %s' % preview_object
            print '    ours:  %s' % link.preview_object
            continue

        # Point the link's preview object at the new url and persist it.
        link.preview_object['url'] = url
        link._commit()
        # Guess at the key that'll contain the (now-incorrect) cache of the
        # preview object so we can delete it and not end up inserting old info
        # into new Links.
        #
        # These parameters are what's used in most of the code; the only place
        # they're overridden is for promoted links, where they could be
        # anything.  We'll just have to deal with those as they come up.
        image_url = _get_scrape_url(link)
        cache_key = MediaByURL._rowkey(image_url, autoplay=False, maxwidth=600)
        print '  deleting cache with key %s' % cache_key
        # Build a stub row and mark it committed so _destroy() issues the
        # delete even though we never loaded the row.
        cache = MediaByURL(_id=cache_key)
        cache._committed = True
        try:
            cache._destroy()
        except pycassa.cassandra.ttypes.InvalidRequestException as e:
            # presumably `why` carries Cassandra's error text -- best-effort
            # deletion, so just log and move on.
            print '    skipping cache deletion (%s)' % e.why
            continue
    # Delete *after* we've updated all the Links so they'll continue to work
    # while we're in the migration process.
    k.delete()