Exemplo n.º 1
0
    def store_request(read):
        """Parse a fetched document and cache its readable form.

        `read` is expected to be a ReadableRequest instance. Its content is
        run through Article, wrapped in a WebPage together with a plain-dict
        summary of the request, and written to the server as JSON under the
        hash of the final url. The WebPage is returned.
        """
        final_url = read.final_url
        page_hash = generate_hash(final_url)

        article = Article(read.content, url=final_url)
        readable_html = article.readable

        # Fall back to a placeholder title when the parsed document does not
        # expose one.
        try:
            page_title = article._original_document.title
        except AttributeError as exc:
            LOG.error(str(exc))
            page_title = 'Unknown'

        # The response object itself cannot be json encoded (its headers are
        # a CaseInsensitiveDict), so copy the pieces we keep into plain
        # builtin types.
        request_summary = {
            'content_type': read.content_type,
            'domain': read.domain,
            'final_url': read.final_url,
            'headers': dict(read.headers),
            'is_error': read.is_error,
            'request_time': read.request_time,
            'start_time': str(read.start_time),
            'status_code': read.status_code,
            'status_message': read.status_message,
            'url': read.url,
        }

        page = WebPage(
            hash_id=page_hash,
            readable=readable_html,
            request=request_summary,
            title=page_title,
            url=final_url,
        )

        serialized_page = json.dumps(dict(page))
        server.set(page_hash, serialized_page)

        # A redirected request gets a second, small record keyed on the
        # original url that references the record stored above.
        was_redirected = read.url != read.final_url
        if was_redirected:
            alias_key = generate_hash(read.url)
            alias_record = json.dumps({
                'reference': page_hash
            })
            server.set(alias_key, alias_record)

        return page
Exemplo n.º 2
0
    def test_cached_webpage(self):
        """Requesting /v for a url leaves the parsed page cached in redis."""
        target_url = 'http://www.google.com/intl/en/about/index.html'
        expected_hash = generate_hash(target_url)

        redirect = self.app.get(
            '/v',
            params={'url': target_url},
            status=302)

        # The first response is only a redirect; follow it so the actual
        # page is requested as well.
        redirect.follow()

        from bookie_parser.models import server
        # The raw key must be present in redis.
        self.assertTrue(server.get(expected_hash), 'The key is found.')

        # Load the record back through the manager and inspect its fields.
        cached = WebPageMgr.get(hash_id=expected_hash)

        self.assertEqual(
            target_url, cached.url,
            "The url is stored in the root object")
        self.assertEqual(
            expected_hash, cached.hash_id,
            "The hash is stored in the root object")
        self.assertTrue(
            cached.request is not None,
            'The request is stored in the cache.')
        self.assertEqual(
            u'Google  - About Google',
            cached.title)
        self.assertTrue(cached.readable is not None)
Exemplo n.º 3
0
    def exists(hash_id=None, url=None):
        """Return the hash id of a stored record, or None if there is none.

        `hash_id` is used as the key when given. Otherwise the key is derived
        from `url`, after stripping leading and trailing slashes from it.

        Returns the hash id when the server reports that the key exists and
        None in every other case, including when neither argument is given.
        """
        if hash_id is None and url is not None:
            hash_id = generate_hash(url.strip('/'))

        if hash_id is None:
            # Neither a hash nor a url was supplied, so there is nothing to
            # look up. Answer directly instead of asking the server about a
            # None key.
            return None

        return hash_id if server.exists(hash_id) else None
Exemplo n.º 4
0
    def test_viewable_response(self):
        """Requesting /v for a url redirects to a page with html content."""
        target_url = 'http://www.google.com/intl/en/about/index.html'
        expected_hash = generate_hash(target_url)

        redirect = self.app.get(
            '/v',
            params={'url': target_url},
            status=302)

        # The first response is only a redirect; the page itself is what we
        # get after following it.
        page = redirect.follow()
        html = page.body.decode('utf8')

        # The redirect target is addressed by the hash of the requested url.
        landed_on_hash = page.request.url.endswith(expected_hash)
        self.assertTrue(
            landed_on_hash,
            'the url should end with the url hash')
        self.assertIn(
            "google.com", html,
            'we should find google in the body. ' + html)
Exemplo n.º 5
0
    def __init__(self, hash_id=None, url=None, readable=None,
                 title=None, request=None, final_url=None):
        """Create a new WebPage data instance.

        Only truthy arguments are stored on the instance; anything left out
        is simply not assigned here.

        hash_id -- key for the page; when omitted but `url` is given, it is
                   generated from the url.
        url -- url of the page.
        readable -- the readable content for the page.
        title -- title of the page.
        request -- dict describing the request; its 'is_error' entry decides
                   whether `is_error` is set to True on the instance.
        final_url -- url the request ended up at.
        """
        if url:
            self.url = url

        if hash_id:
            self.hash_id = hash_id
        elif url:
            # No explicit id was supplied, so derive one from the url.
            self.hash_id = generate_hash(url)

        if readable:
            self.readable = readable
        if request:
            self.request = request

        # Read the request back through the instance so that a default
        # supplied outside this method is still honoured, but tolerate it
        # being absent: a page built without request data must not fail
        # here.
        stored_request = getattr(self, 'request', None)
        if stored_request and stored_request['is_error']:
            self.is_error = True

        if final_url:
            self.final_url = final_url

        if title:
            self.title = title