Пример #1
0
    def scrape(self):
        """Produce a thumbnail and a preview descriptor for this link.

        Returns a 4-tuple ``(thumbnail, preview_object, None, None)``.  Every
        slot is ``None`` when no thumbnail URL is found or no image data can
        be obtained for it.
        """
        nothing = (None, None, None, None)

        found_url, raw_image = self._find_thumbnail_image()
        if not found_url:
            return nothing

        # A protocol-relative URL is ambiguous outside of its page, so pin it
        # to this scraper's protocol before using it.
        if found_url.startswith('//'):
            found_url = coerce_url_to_protocol(found_url, self.protocol)

        # Only fetch when the finder did not already hand back the bytes.
        if not raw_image:
            _content_type, raw_image = _fetch_url(found_url, referer=self.url)
            if not raw_image:
                return nothing

        content_uid = _filename_from_content(raw_image)
        picture = str_to_image(raw_image)
        preview = {
            'uid': content_uid,
            'url': upload_media(picture, category='previews'),
        }
        preview['width'], preview['height'] = picture.size

        return _prepare_image(picture), preview, None, None
Пример #2
0
    def scrape(self):
        """Scrape the link for a thumbnail image and its preview metadata.

        Returns ``(thumbnail, preview_object, None, None)``, or four ``None``
        values if either the thumbnail URL or the image bytes are missing.
        """
        url, payload = self._find_thumbnail_image()
        if not url:
            return (None,) * 4

        # Protocol-relative URLs ("//host/path") only make sense inside a
        # page, so make them absolute with our own protocol.
        url = (coerce_url_to_protocol(url, self.protocol)
               if url.startswith('//') else url)

        # The finder may already have downloaded the image; fetch otherwise.
        if not payload:
            _unused, payload = _fetch_url(url, referer=self.url)
        if not payload:
            return (None,) * 4

        uid = _filename_from_content(payload)
        img = str_to_image(payload)
        stored_at = upload_media(img, category='previews')
        img_width, img_height = img.size
        preview_object = dict(
            uid=uid,
            url=stored_at,
            width=img_width,
            height=img_height,
        )

        thumb = _prepare_image(img)
        return thumb, preview_object, None, None
Пример #3
0
    def _find_thumbnail_image(self):
        """Find what looks like the best thumbnail image URL for this link.

        Preference order: the link itself when it is an image, an author
        supplied ``og:image`` meta tag, a ``<link rel="image_src">`` tag, and
        finally the largest acceptable image found on the page.

        Returns the image URL, or None when the fetched content is neither an
        image nor HTML (or is empty) or no acceptable image is found.
        """
        content_type, content = _fetch_url(self.url)

        # if it's an image, it's pretty easy to guess what we should thumbnail.
        if content_type and "image" in content_type and content:
            return self.url

        if content_type and "html" in content_type and content:
            soup = BeautifulSoup.BeautifulSoup(content)
        else:
            return None

        # allow the content author to specify the thumbnail:
        # <meta property="og:image" content="http://...">
        og_image = (soup.find('meta', property='og:image')
                    or soup.find('meta', attrs={'name': 'og:image'}))
        # Use .get() so a tag lacking the attribute is skipped instead of
        # raising KeyError.
        if og_image and og_image.get('content'):
            return og_image['content']

        # <link rel="image_src" href="http://...">
        thumbnail_spec = soup.find('link', rel='image_src')
        if thumbnail_spec and thumbnail_spec.get('href'):
            return thumbnail_spec['href']

        # ok, we have no guidance from the author. look for the largest
        # image on the page with a few caveats. (see below)
        max_area = 0
        max_url = None
        for image_url in self._extract_image_urls(soup):
            # When isolated from the context of a webpage, protocol-relative
            # URLs are ambiguous, so let's absolutify them now.
            if image_url.startswith('//'):
                image_url = coerce_url_to_protocol(image_url, self.protocol)
            size = _fetch_image_size(image_url, referer=self.url)
            if not size:
                continue

            area = size[0] * size[1]

            # ignore little images (this also rules out a zero dimension, so
            # the ratio below cannot divide by zero)
            if area < 5000:
                g.log.debug('ignore little %s' % image_url)
                continue

            # ignore excessively long/wide images
            # NOTE(review): with int sizes on Python 2 this division floors
            # (e.g. 1.9 -> 1), so the effective cutoff is 2 -- confirm intent.
            if max(size) / min(size) > 1.5:
                g.log.debug('ignore dimensions %s' % image_url)
                continue

            # penalize images with "sprite" in their name
            if 'sprite' in image_url.lower():
                g.log.debug('penalizing sprite %s' % image_url)
                area /= 10

            if area > max_area:
                max_area = area
                max_url = image_url
        return max_url
Пример #4
0
    def _find_thumbnail_image(self):
        """Return the URL of the best thumbnail candidate for this link.

        The link itself wins when it is an image.  For HTML pages the
        author's ``og:image`` meta tag is tried first, then a
        ``<link rel="image_src">`` tag, then the largest acceptable image on
        the page.  Returns None when nothing suitable is found or the
        fetched content is neither an image nor HTML.
        """
        content_type, content = _fetch_url(self.url)

        # if it's an image, it's pretty easy to guess what we should thumbnail.
        if content_type and "image" in content_type and content:
            return self.url

        if content_type and "html" in content_type and content:
            soup = BeautifulSoup.BeautifulSoup(content)
        else:
            return None

        # allow the content author to specify the thumbnail:
        # <meta property="og:image" content="http://...">
        og_image = (soup.find('meta', property='og:image') or
                    soup.find('meta', attrs={'name': 'og:image'}))
        # A matching tag is not guaranteed to carry the attribute; .get()
        # avoids the KeyError that subscripting would raise.
        if og_image and og_image.get('content'):
            return og_image['content']

        # <link rel="image_src" href="http://...">
        thumbnail_spec = soup.find('link', rel='image_src')
        if thumbnail_spec and thumbnail_spec.get('href'):
            return thumbnail_spec['href']

        # ok, we have no guidance from the author. look for the largest
        # image on the page with a few caveats. (see below)
        max_area = 0
        max_url = None
        for image_url in self._extract_image_urls(soup):
            # When isolated from the context of a webpage, protocol-relative
            # URLs are ambiguous, so let's absolutify them now.
            if image_url.startswith('//'):
                image_url = coerce_url_to_protocol(image_url, self.protocol)
            size = _fetch_image_size(image_url, referer=self.url)
            if not size:
                continue

            area = size[0] * size[1]

            # ignore little images (a zero dimension gives area 0 and is
            # dropped here, before the ratio is computed)
            if area < 5000:
                g.log.debug('ignore little %s' % image_url)
                continue

            # ignore excessively long/wide images
            # NOTE(review): integer sizes under Python 2 make this a floor
            # division, so ratios below 2 pass -- confirm that is intended.
            if max(size) / min(size) > 1.5:
                g.log.debug('ignore dimensions %s' % image_url)
                continue

            # penalize images with "sprite" in their name
            if 'sprite' in image_url.lower():
                g.log.debug('penalizing sprite %s' % image_url)
                area /= 10

            if area > max_area:
                max_area = area
                max_url = image_url
        return max_url
Пример #5
0
    def scrape(self):
        """Build a thumbnail for this link.

        Returns a 3-tuple whose first item is whatever
        ``_make_thumbnail_from_url`` produces for the chosen image URL; the
        remaining two items are always ``None``.
        """
        image_url = self._find_thumbnail_image()

        # Protocol-relative URLs are ambiguous once taken out of their page,
        # so make them absolute using this scraper's protocol.
        if image_url:
            if image_url.startswith("//"):
                image_url = coerce_url_to_protocol(image_url, self.protocol)

        return (
            _make_thumbnail_from_url(image_url, referer=self.url),
            None,
            None,
        )
Пример #6
0
    def scrape(self):
        """Return ``(thumbnail, None, None)`` for this link.

        The thumbnail is produced by ``_make_thumbnail_from_url`` from the
        URL chosen by ``_find_thumbnail_image``.
        """
        candidate = self._find_thumbnail_image()

        # A "//host/path" URL has no scheme of its own; outside of a page
        # that is ambiguous, so attach this scraper's protocol.
        protocol_relative = bool(candidate) and candidate.startswith('//')
        if protocol_relative:
            candidate = coerce_url_to_protocol(candidate, self.protocol)

        result = _make_thumbnail_from_url(candidate, referer=self.url)
        return result, None, None
Пример #7
0
    def scrape(self):
        """Build thumbnail, preview and embedded-player objects for the link.

        When ``URL_MATCH`` matches ``self.url`` and captures group 5,
        ``self.url`` is rewritten to the ``player.vimeo.com`` form (plus
        group 6 when present) before being wrapped in an iframe.

        Returns ``(thumbnail, preview_object, media_object, media_object)``;
        the same media object fills both trailing slots.  Returns four
        ``None`` values when no thumbnail URL or image data is available.
        """
        nothing = (None, None, None, None)

        found_url, raw_image = self._find_thumbnail_image()
        if not found_url:
            return nothing

        # Protocol-relative URLs are ambiguous outside of a page.
        if found_url.startswith('//'):
            found_url = coerce_url_to_protocol(found_url, self.protocol)

        # Fetch the bytes only if the finder did not already supply them.
        if not raw_image:
            _content_type, raw_image = _fetch_url(found_url, referer=self.url)
            if not raw_image:
                return nothing

        content_uid = _filename_from_content(raw_image)
        picture = str_to_image(raw_image)
        stored_at = upload_media(picture, category='previews')
        pic_width, pic_height = picture.size
        preview = dict(
            uid=content_uid,
            url=stored_at,
            width=pic_width,
            height=pic_height,
        )

        thumb = _prepare_image(picture)

        # Point self.url at the embeddable player when the pattern matches.
        hit = self.URL_MATCH.match(self.url)
        if hit and hit.group(5):
            self.url = 'https://player.vimeo.com/video/' + hit.group(5)
            trailing = hit.group(6)
            if trailing:
                self.url = self.url + trailing

        # NOTE(review): self.url is interpolated into the markup unescaped;
        # confirm it is validated upstream when URL_MATCH does not match.
        iframe_open = '<iframe width="640" height="360" style="max-width: 100%;" src="'
        iframe_close = '" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>'
        embed = self._make_media_object({
            'html': iframe_open + self.url + iframe_close,
            'width': 640,
            'height': 360,
            'thumbnail_url': found_url,
        })

        return thumb, preview, embed, embed
Пример #8
0
    def scrape(self):
        """Build thumbnail, preview and imgur embed objects for the link.

        When ``URL_MATCH`` matches ``self.url`` and captures group 3,
        ``self.url`` is replaced by that group before the embed markup is
        built around it.

        Returns ``(thumbnail, preview_object, media_object, media_object)``;
        the same media object fills both trailing slots.  Returns four
        ``None`` values when no thumbnail URL or image data is available.
        """
        url, payload = self._find_thumbnail_image()
        if not url:
            return (None,) * 4

        # Protocol-relative URLs are ambiguous outside of a page.
        if url.startswith('//'):
            url = coerce_url_to_protocol(url, self.protocol)

        # Reuse bytes from the finder when present, otherwise download.
        if not payload:
            _unused, payload = _fetch_url(url, referer=self.url)
        if not payload:
            return (None,) * 4

        uid = _filename_from_content(payload)
        img = str_to_image(payload)
        preview_object = {
            'uid': uid,
            'url': upload_media(img, category='previews'),
        }
        preview_object['width'], preview_object['height'] = img.size

        thumb = _prepare_image(img)

        hit = self.URL_MATCH.match(self.url)
        if hit:
            captured = hit.group(3)
            if captured:
                self.url = captured

        # NOTE(review): self.url goes into the markup unescaped; confirm it
        # is validated upstream for the case where URL_MATCH does not match.
        markup = (
            '<blockquote class="imgur-embed-pub" lang="en" data-id="a/'
            + self.url
            + '"><a href="//imgur.com/'
            + self.url
            + '"></a></blockquote><script async src="//s.imgur.com/min/embed.js" charset="utf-8"></script>'
        )
        embed = self._make_media_object({
            'html': markup,
            'thumbnail_url': url,
        })

        return thumb, preview_object, embed, embed
Пример #9
0
    def scrape(self):
        """Build thumbnail, preview and iframe embed objects for the link.

        ``self.url`` has ``/videos/watch/`` replaced by ``/videos/embed/``
        and is then wrapped in a sandboxed iframe.

        Returns ``(thumbnail, preview_object, media_object, media_object)``;
        the same media object fills both trailing slots.  Returns four
        ``None`` values when no thumbnail URL or image data is available.
        """
        nothing = (None, None, None, None)

        found_url, raw_image = self._find_thumbnail_image()
        if not found_url:
            return nothing

        # Protocol-relative URLs are ambiguous outside of a page.
        if found_url.startswith('//'):
            found_url = coerce_url_to_protocol(found_url, self.protocol)

        # Download only when the finder did not already return the bytes.
        if not raw_image:
            _content_type, raw_image = _fetch_url(found_url, referer=self.url)
            if not raw_image:
                return nothing

        content_uid = _filename_from_content(raw_image)
        picture = str_to_image(raw_image)
        stored_at = upload_media(picture, category='previews')
        pic_width, pic_height = picture.size
        preview = {
            'uid': content_uid,
            'url': stored_at,
            'width': pic_width,
            'height': pic_height,
        }

        thumb = _prepare_image(picture)

        # Switch from the watch page to the embeddable player page.
        self.url = self.url.replace('/videos/watch/', '/videos/embed/')

        # NOTE(review): self.url is placed in the markup unescaped; confirm
        # it is validated before this scraper is selected.
        iframe_open = '<iframe width="560" height="315" sandbox="allow-same-origin allow-scripts" src="'
        iframe_close = '" frameborder="0" allowfullscreen style="max-width: 100%;"></iframe>'
        embed = self._make_media_object({
            'html': iframe_open + self.url + iframe_close,
            'width': 560,
            'height': 315,
            'thumbnail_url': found_url,
        })

        return thumb, preview, embed, embed
Пример #10
0
    def test_coerce_url_to_protocol(self):
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.

        # With no protocol argument every input is expected to come back as
        # an http URL.
        self.assertEqual(
            utils.coerce_url_to_protocol('http://example.com/foo'),
            'http://example.com/foo')

        self.assertEqual(
            utils.coerce_url_to_protocol('https://example.com/foo'),
            'http://example.com/foo')

        self.assertEqual(utils.coerce_url_to_protocol('//example.com/foo'),
                         'http://example.com/foo')

        # With an explicit 'https' protocol every input is expected to come
        # back as an https URL.
        self.assertEqual(
            utils.coerce_url_to_protocol('http://example.com/foo', 'https'),
            'https://example.com/foo')

        self.assertEqual(
            utils.coerce_url_to_protocol('https://example.com/foo', 'https'),
            'https://example.com/foo')

        self.assertEqual(
            utils.coerce_url_to_protocol('//example.com/foo', 'https'),
            'https://example.com/foo')
Пример #11
0
    def test_coerce_url_to_protocol(self):
        # Each case is (positional arguments, expected result).  The first
        # three rely on the default protocol, the last three pass 'https'.
        cases = (
            (('http://example.com/foo',), 'http://example.com/foo'),
            (('https://example.com/foo',), 'http://example.com/foo'),
            (('//example.com/foo',), 'http://example.com/foo'),
            (('http://example.com/foo', 'https'), 'https://example.com/foo'),
            (('https://example.com/foo', 'https'), 'https://example.com/foo'),
            (('//example.com/foo', 'https'), 'https://example.com/foo'),
        )
        for call_args, expected in cases:
            # assertEqual replaces the deprecated assertEquals alias, which
            # was removed from unittest in Python 3.12.
            self.assertEqual(
                utils.coerce_url_to_protocol(*call_args), expected)
Пример #12
0
    def _find_thumbnail_image(self):
        """Find what we think is the best thumbnail image for a link.

        Returns a 2-tuple of image url and, as an optimization, the raw image
        data.  A value of None for the former means we couldn't find an image;
        None for the latter just means we haven't already fetched the image.
        """
        content_type, content = _fetch_url(self.url)

        # if it's an image, it's pretty easy to guess what we should thumbnail.
        if content_type and "image" in content_type and content:
            return self.url, content

        if content_type and "html" in content_type and content:
            soup = BeautifulSoup.BeautifulSoup(content)
        else:
            return None, None

        # Allow the content author to specify the thumbnail using the Open
        # Graph protocol: http://ogp.me/
        og_image = (soup.find('meta', property='og:image') or
                    soup.find('meta', attrs={'name': 'og:image'}))
        if og_image and og_image.get('content'):
            return og_image['content'], None
        og_image = (soup.find('meta', property='og:image:url') or
                    soup.find('meta', attrs={'name': 'og:image:url'}))
        if og_image and og_image.get('content'):
            return og_image['content'], None

        # <link rel="image_src" href="http://...">
        # Use .get() here as well: a matching <link> without an href would
        # otherwise raise KeyError instead of falling through.
        thumbnail_spec = soup.find('link', rel='image_src')
        if thumbnail_spec and thumbnail_spec.get('href'):
            return thumbnail_spec['href'], None

        # ok, we have no guidance from the author. look for the largest
        # image on the page with a few caveats. (see below)
        max_area = 0
        max_url = None
        for image_url in self._extract_image_urls(soup):
            # When isolated from the context of a webpage, protocol-relative
            # URLs are ambiguous, so let's absolutify them now.
            if image_url.startswith('//'):
                image_url = coerce_url_to_protocol(image_url, self.protocol)
            size = _fetch_image_size(image_url, referer=self.url)
            if not size:
                continue

            area = size[0] * size[1]

            # ignore little images (a zero dimension gives area 0 and is
            # dropped here, before the ratio is computed)
            if area < 5000:
                g.log.debug('ignore little %s' % image_url)
                continue

            # ignore excessively long/wide images
            # NOTE(review): with int sizes on Python 2 this division floors
            # (e.g. 1.9 -> 1), so the effective cutoff is 2 -- confirm intent.
            if max(size) / min(size) > 1.5:
                g.log.debug('ignore dimensions %s' % image_url)
                continue

            # penalize images with "sprite" in their name
            if 'sprite' in image_url.lower():
                g.log.debug('penalizing sprite %s' % image_url)
                area /= 10

            if area > max_area:
                max_area = area
                max_url = image_url

        return max_url, None
Пример #13
0
    def _find_thumbnail_image(self):
        """Find what we think is the best thumbnail image for a link.

        Returns a 2-tuple of image url and, as an optimization, the raw image
        data.  A value of None for the former means we couldn't find an image;
        None for the latter just means we haven't already fetched the image.
        """
        content_type, content = _fetch_url(self.url)

        # if it's an image, it's pretty easy to guess what we should thumbnail.
        if content_type and "image" in content_type and content:
            return self.url, content

        if content_type and "html" in content_type and content:
            soup = BeautifulSoup.BeautifulSoup(content)
        else:
            return None, None

        # Allow the content author to specify the thumbnail using the Open
        # Graph protocol: http://ogp.me/
        # og:image is tried before og:image:url; each may appear either as a
        # "property" or as a "name" attribute on the <meta> tag.
        for og_property in ('og:image', 'og:image:url'):
            og_image = (soup.find('meta', property=og_property) or
                        soup.find('meta', attrs={'name': og_property}))
            # .get() skips a tag that lacks "content" instead of raising
            # KeyError, so the remaining strategies still get a chance.
            if og_image and og_image.get('content'):
                return og_image['content'], None

        # <link rel="image_src" href="http://...">
        thumbnail_spec = soup.find('link', rel='image_src')
        if thumbnail_spec and thumbnail_spec.get('href'):
            return thumbnail_spec['href'], None

        # ok, we have no guidance from the author. look for the largest
        # image on the page with a few caveats. (see below)
        max_area = 0
        max_url = None
        for image_url in self._extract_image_urls(soup):
            # When isolated from the context of a webpage, protocol-relative
            # URLs are ambiguous, so let's absolutify them now.
            if image_url.startswith('//'):
                image_url = coerce_url_to_protocol(image_url, self.protocol)
            size = _fetch_image_size(image_url, referer=self.url)
            if not size:
                continue

            area = size[0] * size[1]

            # ignore little images (this also rules out a zero dimension, so
            # the ratio below cannot divide by zero)
            if area < 5000:
                g.log.debug('ignore little %s' % image_url)
                continue

            # ignore excessively long/wide images
            # NOTE(review): integer sizes under Python 2 make this a floor
            # division, so ratios below 2 pass -- confirm that is intended.
            if max(size) / min(size) > 1.5:
                g.log.debug('ignore dimensions %s' % image_url)
                continue

            # penalize images with "sprite" in their name
            if 'sprite' in image_url.lower():
                g.log.debug('penalizing sprite %s' % image_url)
                area /= 10

            if area > max_area:
                max_area = area
                max_url = image_url

        return max_url, None