示例#1
0
    def test_missing_title(self) -> None:
        """Check the parsed title when the document has no usable <title>.

        Two documents are exercised: one with no <title> or heading at all
        (the title is expected to be None), and one with an empty <title>
        plus an <h1> (the heading text is expected as the title).
        """
        url = "http://example.com/test.html"

        # No <title> element and no heading.
        document = b"""
        <html>
        <body>
        Some text.
        </body>
        </html>
        """
        parsed = parse_html_to_open_graph(decode_body(document, url))
        expected = {"og:title": None, "og:description": "Some text."}
        self.assertEqual(parsed, expected)

        # Another variant is a title with no content.
        document = b"""
        <html>
        <head><title></title></head>
        <body>
        <h1>Title</h1>
        </body>
        </html>
        """
        parsed = parse_html_to_open_graph(decode_body(document, url))
        expected = {"og:title": "Title", "og:description": "Title"}
        self.assertEqual(parsed, expected)
    def test_comment2(self):
        """An HTML comment between text nodes must not appear in the description."""
        url = "http://example.com/test.html"
        document = b"""
        <html>
        <head><title>Foo</title></head>
        <body>
        Some text.
        <!-- HTML comment -->
        Some more text.
        <p>Text</p>
        More text
        </body>
        </html>
        """

        parsed = parse_html_to_open_graph(decode_body(document, url), url)

        # Each text fragment is expected to be separated by a blank line.
        expected = {
            "og:title": "Foo",
            "og:description": "Some text.\n\nSome more text.\n\nText\n\nMore text",
        }
        self.assertEqual(parsed, expected)
示例#3
0
    def test_twitter_tag(self) -> None:
        """Twitter card tags should be used if nothing else is available."""
        url = "http://example.com/test.html"

        # Only Twitter card metadata is present.
        document = b"""
        <html>
        <meta name="twitter:card" content="summary">
        <meta name="twitter:description" content="Description">
        <meta name="twitter:site" content="@matrixdotorg">
        </html>
        """
        parsed = parse_html_to_open_graph(decode_body(document, url))
        expected = {
            "og:title": None,
            "og:description": "Description",
            "og:site_name": "@matrixdotorg",
        }
        self.assertEqual(parsed, expected)

        # But they shouldn't override Open Graph values.
        document = b"""
        <html>
        <meta name="twitter:card" content="summary">
        <meta name="twitter:description" content="Description">
        <meta property="og:description" content="Real Description">
        <meta name="twitter:site" content="@matrixdotorg">
        <meta property="og:site_name" content="matrix.org">
        </html>
        """
        parsed = parse_html_to_open_graph(decode_body(document, url))
        expected = {
            "og:title": None,
            "og:description": "Real Description",
            "og:site_name": "matrix.org",
        }
        self.assertEqual(parsed, expected)
 def test_windows_1252(self):
     """A body which uses cp1252, but doesn't declare that.

     The title byte 0xF3 is expected to be decoded as "ó".
     """
     url = "http://example.com/test.html"
     document = b"""
     <html>
     <head><title>\xf3</title></head>
     <body>
     Some text.
     </body>
     </html>
     """
     parsed = parse_html_to_open_graph(decode_body(document, url), url)
     expected = {"og:title": "ó", "og:description": "Some text."}
     self.assertEqual(parsed, expected)
示例#5
0
 def test_invalid_encoding(self) -> None:
     """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
     document = b"""
     <html>
     <head><title>Foo</title></head>
     <body>
     Some text.
     </body>
     </html>
     """
     # The third argument names an encoding that does not exist.
     parsed_tree = decode_body(
         document, "http://example.com/test.html", "invalid-encoding"
     )
     parsed = parse_html_to_open_graph(parsed_tree)
     expected = {"og:title": "Foo", "og:description": "Some text."}
     self.assertEqual(parsed, expected)
示例#6
0
 def test_invalid_encoding2(self) -> None:
     """A body which doesn't match the sent character encoding."""
     # Note that this contains an invalid UTF-8 sequence in the title.
     document = b"""
     <html>
     <head><title>\xff\xff Foo</title></head>
     <body>
     Some text.
     </body>
     </html>
     """
     parsed_tree = decode_body(document, "http://example.com/test.html")
     parsed = parse_html_to_open_graph(parsed_tree)
     # The two 0xFF bytes are expected to come back as "ÿÿ".
     expected = {"og:title": "ÿÿ Foo", "og:description": "Some text."}
     self.assertEqual(parsed, expected)
示例#7
0
    def test_xml(self) -> None:
        """Test decoding XML and ensure it works properly."""
        # Note that the strip() call is important to ensure the xml tag starts
        # at the initial byte.
        document = b"""
        <?xml version="1.0" encoding="UTF-8"?>

        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
        <head><title>Foo</title></head><body>Some text.</body></html>
        """.strip()
        parsed_tree = decode_body(document, "http://example.com/test.html")
        parsed = parse_html_to_open_graph(parsed_tree)
        expected = {"og:title": "Foo", "og:description": "Some text."}
        self.assertEqual(parsed, expected)
示例#8
0
    def test_missing_title_and_broken_h1(self) -> None:
        """No <title>, and an <h1> holding only an empty link: title should be None."""
        document = b"""
        <html>
        <body>
        <h1><a href="foo"/></h1>
        Some text.
        </body>
        </html>
        """

        parsed_tree = decode_body(document, "http://example.com/test.html")
        parsed = parse_html_to_open_graph(parsed_tree)

        expected = {"og:title": None, "og:description": "Some text."}
        self.assertEqual(parsed, expected)
示例#9
0
    def test_h1_as_title(self) -> None:
        """With no <title> element, the <h1> text is expected as the title."""
        document = b"""
        <html>
        <meta property="og:description" content="Some text."/>
        <body>
        <h1>Title</h1>
        </body>
        </html>
        """

        parsed_tree = decode_body(document, "http://example.com/test.html")
        parsed = parse_html_to_open_graph(parsed_tree)

        expected = {"og:title": "Title", "og:description": "Some text."}
        self.assertEqual(parsed, expected)
示例#10
0
    def test_simple(self) -> None:
        """A minimal document with a <title> and plain body text."""
        document = b"""
        <html>
        <head><title>Foo</title></head>
        <body>
        Some text.
        </body>
        </html>
        """

        parsed_tree = decode_body(document, "http://example.com/test.html")
        parsed = parse_html_to_open_graph(parsed_tree)

        expected = {"og:title": "Foo", "og:description": "Some text."}
        self.assertEqual(parsed, expected)
    def test_missing_title(self):
        """A document without a <title> is expected to yield a None title."""
        url = "http://example.com/test.html"
        document = b"""
        <html>
        <body>
        Some text.
        </body>
        </html>
        """

        parsed = parse_html_to_open_graph(decode_body(document, url), url)

        expected = {"og:title": None, "og:description": "Some text."}
        self.assertEqual(parsed, expected)
示例#12
0
 def test_nested_nodes(self) -> None:
     """A body with some nested nodes. Tests that we iterate over children
     in the right order (and don't reverse the order of the text)."""
     document = b"""
     <a href="somewhere">Welcome <b>the bold <u>and underlined text <svg>
     with a cheeky SVG</svg></u> and <strong>some</strong> tail text</b></a>
     """
     parsed_tree = decode_body(document, "http://example.com/test.html")
     parsed = parse_html_to_open_graph(parsed_tree)

     # The text inside the <svg> element is expected to be absent.
     expected = {
         "og:title": None,
         "og:description": "Welcome\n\nthe bold\n\nand underlined text\n\nand\n\nsome\n\ntail text",
     }
     self.assertEqual(parsed, expected)
示例#13
0
    def test_empty_description(self) -> None:
        """Description tags with empty content should be ignored."""
        document = b"""
        <html>
        <meta property="og:description" content=""/>
        <meta property="og:description"/>
        <meta name="description" content=""/>
        <meta name="description"/>
        <meta name="description" content="Finally!"/>
        <body>
        <h1>Title</h1>
        </body>
        </html>
        """

        parsed_tree = decode_body(document, "http://example.com/test.html")
        parsed = parse_html_to_open_graph(parsed_tree)

        # Only the last description tag carries non-empty content.
        expected = {"og:title": "Title", "og:description": "Finally!"}
        self.assertEqual(parsed, expected)
    async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
        """Check the db, and download the URL and build a preview

        A cached entry is returned directly when it has not expired at `ts`
        and was stored with a 2xx response code. Otherwise the URL (or its
        oEmbed endpoint, if one is known) is fetched, Open Graph data is
        built from the result, and that data is written back to the URL cache.

        Args:
            url: The URL to preview.
            user: The user requesting the preview.
            ts: The timestamp requested for the preview.

        Returns:
            json-encoded og data
        """
        # check the URL cache in the DB (which will also provide us with
        # historical previews, if we have any)
        cache_result = await self.store.get_url_cache(url, ts)
        if (
            cache_result
            and cache_result["expires_ts"] > ts
            # Floor division so that every 2xx code counts as a success; true
            # division would only match an exact 200.
            and cache_result["response_code"] // 100 == 2
        ):
            # It may be stored as text in the database, not as bytes (such as
            # PostgreSQL). If so, encode it back before handing it on.
            og = cache_result["og"]
            if isinstance(og, str):
                og = og.encode("utf8")
            return og

        # If this URL can be accessed via oEmbed, use that instead.
        url_to_download = url
        oembed_url = self._oembed.get_oembed_url(url)
        if oembed_url:
            url_to_download = oembed_url

        media_info = await self._handle_url(url_to_download, user)

        logger.debug("got media_info of '%s'", media_info)

        # The number of milliseconds that the response should be considered valid.
        expiration_ms = media_info.expires
        author_name: Optional[str] = None

        if _is_media(media_info.media_type):
            file_id = media_info.filesystem_id
            dims = await self.media_repo._generate_thumbnails(
                None, file_id, file_id, media_info.media_type, url_cache=True
            )

            # define our OG response for this media
            og = {
                "og:description": media_info.download_name,
                "og:image": f"mxc://{self.server_name}/{media_info.filesystem_id}",
                "og:image:type": media_info.media_type,
                "matrix:image:size": media_info.media_length,
            }

            if dims:
                og["og:image:width"] = dims["width"]
                og["og:image:height"] = dims["height"]
            else:
                logger.warning("Couldn't get dims for %s", url)

        elif _is_html(media_info.media_type):
            # TODO: somehow stop a big HTML tree from exploding synapse's RAM

            with open(media_info.filename, "rb") as file:
                body = file.read()

            tree = decode_body(body, media_info.uri, media_info.media_type)
            if tree is not None:
                # Check if this HTML document points to oEmbed information and
                # defer to that.
                oembed_url = self._oembed.autodiscover_from_html(tree)
                og_from_oembed: JsonDict = {}
                if oembed_url:
                    oembed_info = await self._handle_url(
                        oembed_url, user, allow_data_urls=True
                    )
                    (
                        og_from_oembed,
                        author_name,
                        expiration_ms,
                    ) = await self._handle_oembed_response(
                        url, oembed_info, expiration_ms
                    )

                # Parse Open Graph information from the HTML in case the oEmbed
                # response failed or is incomplete.
                og_from_html = parse_html_to_open_graph(tree, media_info.uri)

                # Compile the Open Graph response by using the scraped
                # information from the HTML and overlaying any information
                # from the oEmbed response.
                og = {**og_from_html, **og_from_oembed}

                await self._precache_image_url(user, media_info, og)
            else:
                og = {}

        elif oembed_url:
            # Handle the oEmbed information.
            og, author_name, expiration_ms = await self._handle_oembed_response(
                url, media_info, expiration_ms
            )
            await self._precache_image_url(user, media_info, og)

        else:
            logger.warning("Failed to find any OG data in %s", url)
            og = {}

        # If we don't have a title but we have author_name, copy it as
        # title
        if not og.get("og:title") and author_name:
            og["og:title"] = author_name

        # filter out any stupidly long values; keys are collected first so the
        # dict is not mutated while it is being iterated
        keys_to_remove = []
        for k, v in og.items():
            # values can be numeric as well as strings, hence the cast to str
            if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN:
                logger.warning(
                    "Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN]
                )
                keys_to_remove.append(k)
        for k in keys_to_remove:
            del og[k]

        logger.debug("Calculated OG for %s as %s", url, og)

        jsonog = json_encoder.encode(og)

        # Cap the amount of time to consider a response valid.
        expiration_ms = min(expiration_ms, ONE_DAY)

        # store OG in history-aware DB cache
        await self.store.store_url_cache(
            url,
            media_info.response_code,
            media_info.etag,
            media_info.created_ts_ms + expiration_ms,
            jsonog,
            media_info.filesystem_id,
            media_info.created_ts_ms,
        )

        return jsonog.encode("utf8")
 def test_no_tree(self):
     """A valid body with no tree in it: decode_body should return None."""
     result = decode_body(b"\x00", "http://example.com/test.html")
     self.assertIsNone(result)
 def test_empty(self):
     """Test a body with no data in it: decode_body should return None."""
     result = decode_body(b"", "http://example.com/test.html")
     self.assertIsNone(result)