def test_missing_title(self) -> None:
    """A missing <title> gives no og:title; an empty one expects the <h1> text."""
    url = "http://example.com/test.html"

    untitled = b""" <html> <body> Some text. </body> </html> """
    parsed = decode_body(untitled, url)
    result = parse_html_to_open_graph(parsed)
    expected = {"og:title": None, "og:description": "Some text."}
    self.assertEqual(result, expected)

    # Another variant is a title with no content.
    empty_title = b""" <html> <head><title></title></head> <body> <h1>Title</h1> </body> </html> """
    parsed = decode_body(empty_title, url)
    result = parse_html_to_open_graph(parsed)
    expected = {"og:title": "Title", "og:description": "Title"}
    self.assertEqual(result, expected)
def test_comment2(self) -> None:
    """An HTML comment between text nodes must not appear in the description."""
    html = b""" <html> <head><title>Foo</title></head> <body> Some text. <!-- HTML comment --> Some more text. <p>Text</p> More text </body> </html> """

    tree = decode_body(html, "http://example.com/test.html")
    og = parse_html_to_open_graph(tree, "http://example.com/test.html")

    self.assertEqual(
        og,
        {
            "og:title": "Foo",
            "og:description": "Some text.\n\nSome more text.\n\nText\n\nMore text",
        },
    )
def test_twitter_tag(self) -> None:
    """Twitter card tags are a fallback and must not override Open Graph tags."""
    url = "http://example.com/test.html"

    twitter_only = b""" <html> <meta name="twitter:card" content="summary"> <meta name="twitter:description" content="Description"> <meta name="twitter:site" content="@matrixdotorg"> </html> """
    parsed = decode_body(twitter_only, url)
    result = parse_html_to_open_graph(parsed)
    expected = {
        "og:title": None,
        "og:description": "Description",
        "og:site_name": "@matrixdotorg",
    }
    self.assertEqual(result, expected)

    # But they shouldn't override Open Graph values.
    twitter_and_og = b""" <html> <meta name="twitter:card" content="summary"> <meta name="twitter:description" content="Description"> <meta property="og:description" content="Real Description"> <meta name="twitter:site" content="@matrixdotorg"> <meta property="og:site_name" content="matrix.org"> </html> """
    parsed = decode_body(twitter_and_og, url)
    result = parse_html_to_open_graph(parsed)
    expected = {
        "og:title": None,
        "og:description": "Real Description",
        "og:site_name": "matrix.org",
    }
    self.assertEqual(result, expected)
def test_windows_1252(self) -> None:
    """A body which uses cp1252, but doesn't declare that."""
    # The title is the single byte 0xF3, which is expected to decode to "ó".
    html = b""" <html> <head><title>\xf3</title></head> <body> Some text. </body> </html> """

    tree = decode_body(html, "http://example.com/test.html")
    og = parse_html_to_open_graph(tree, "http://example.com/test.html")

    self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
def test_invalid_encoding(self) -> None:
    """An unrecognised encoding name should not stop the body being parsed."""
    markup = b""" <html> <head><title>Foo</title></head> <body> Some text. </body> </html> """

    # The third argument deliberately names an encoding that does not exist.
    parsed = decode_body(markup, "http://example.com/test.html", "invalid-encoding")
    result = parse_html_to_open_graph(parsed)

    expected = {"og:title": "Foo", "og:description": "Some text."}
    self.assertEqual(result, expected)
def test_invalid_encoding2(self) -> None:
    """The body's bytes do not match the character encoding that was sent."""
    # Note that this contains an invalid UTF-8 sequence in the title.
    markup = b""" <html> <head><title>\xff\xff Foo</title></head> <body> Some text. </body> </html> """

    parsed = decode_body(markup, "http://example.com/test.html")
    result = parse_html_to_open_graph(parsed)

    expected = {"og:title": "ÿÿ Foo", "og:description": "Some text."}
    self.assertEqual(result, expected)
def test_xml(self) -> None:
    """An XHTML document with an XML declaration should decode and parse."""
    # Note that the strip() call is important to ensure the xml tag starts
    # at the initial byte.
    markup = b""" <?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> <head><title>Foo</title></head><body>Some text.</body></html> """.strip()

    parsed = decode_body(markup, "http://example.com/test.html")
    result = parse_html_to_open_graph(parsed)

    expected = {"og:title": "Foo", "og:description": "Some text."}
    self.assertEqual(result, expected)
def test_missing_title_and_broken_h1(self) -> None:
    """An <h1> holding only an empty link should not be used as the title."""
    markup = b""" <html> <body> <h1><a href="foo"/></h1> Some text. </body> </html> """

    parsed = decode_body(markup, "http://example.com/test.html")
    result = parse_html_to_open_graph(parsed)

    expected = {"og:title": None, "og:description": "Some text."}
    self.assertEqual(result, expected)
def test_h1_as_title(self) -> None:
    """With no <title>, the <h1> text is expected as og:title."""
    markup = b""" <html> <meta property="og:description" content="Some text."/> <body> <h1>Title</h1> </body> </html> """

    parsed = decode_body(markup, "http://example.com/test.html")
    result = parse_html_to_open_graph(parsed)

    expected = {"og:title": "Title", "og:description": "Some text."}
    self.assertEqual(result, expected)
def test_simple(self) -> None:
    """A plain page: <title> becomes og:title, body text og:description."""
    markup = b""" <html> <head><title>Foo</title></head> <body> Some text. </body> </html> """

    parsed = decode_body(markup, "http://example.com/test.html")
    result = parse_html_to_open_graph(parsed)

    expected = {"og:title": "Foo", "og:description": "Some text."}
    self.assertEqual(result, expected)
def test_missing_title(self) -> None:
    """A document without a <title> should give og:title as None."""
    html = b""" <html> <body> Some text. </body> </html> """

    tree = decode_body(html, "http://example.com/test.html")
    og = parse_html_to_open_graph(tree, "http://example.com/test.html")

    self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
def test_nested_nodes(self) -> None:
    """Nested elements should contribute their text in document order.

    The expected description keeps the text pieces in their original order
    (not reversed) and contains none of the text inside the <svg> element.
    """
    markup = b""" <a href="somewhere">Welcome <b>the bold <u>and underlined text <svg> with a cheeky SVG</svg></u> and <strong>some</strong> tail text</b></a> """

    parsed = decode_body(markup, "http://example.com/test.html")
    result = parse_html_to_open_graph(parsed)

    expected = {
        "og:title": None,
        "og:description": "Welcome\n\nthe bold\n\nand underlined text\n\nand\n\nsome\n\ntail text",
    }
    self.assertEqual(result, expected)
def test_empty_description(self) -> None:
    """Empty or content-less description tags are skipped for a non-empty one."""
    markup = b""" <html> <meta property="og:description" content=""/> <meta property="og:description"/> <meta name="description" content=""/> <meta name="description"/> <meta name="description" content="Finally!"/> <body> <h1>Title</h1> </body> </html> """

    parsed = decode_body(markup, "http://example.com/test.html")
    result = parse_html_to_open_graph(parsed)

    expected = {"og:title": "Title", "og:description": "Finally!"}
    self.assertEqual(result, expected)
async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
    """Check the db, and download the URL and build a preview

    A cached preview is returned directly when it has not expired relative
    to ``ts`` and was stored with a 2xx response code. Otherwise the URL (or
    its oEmbed endpoint, if one is known for it) is downloaded, Open Graph
    data is built from the response, overlong tags are pruned, and the result
    is stored in the URL cache before being returned.

    Args:
        url: The URL to preview.
        user: The user requesting the preview.
        ts: The timestamp requested for the preview.

    Returns:
        json-encoded og data
    """
    # check the URL cache in the DB (which will also provide us with
    # historical previews, if we have any)
    cache_result = await self.store.get_url_cache(url, ts)
    if (
        cache_result
        and cache_result["expires_ts"] > ts
        # Floor division so that every 2xx response counts as a success.
        # True division (`/`) only ever equals 2 for a code of exactly 200.
        and cache_result["response_code"] // 100 == 2
    ):
        # It may be stored as text in the database, not as bytes (such as
        # PostgreSQL). If so, encode it back before handing it on.
        og = cache_result["og"]
        if isinstance(og, str):
            og = og.encode("utf8")
        return og

    # If this URL can be accessed via oEmbed, use that instead.
    url_to_download = url
    oembed_url = self._oembed.get_oembed_url(url)
    if oembed_url:
        url_to_download = oembed_url

    media_info = await self._handle_url(url_to_download, user)

    logger.debug("got media_info of '%s'", media_info)

    # The number of milliseconds that the response should be considered valid.
    expiration_ms = media_info.expires
    author_name: Optional[str] = None

    if _is_media(media_info.media_type):
        file_id = media_info.filesystem_id
        dims = await self.media_repo._generate_thumbnails(
            None, file_id, file_id, media_info.media_type, url_cache=True
        )

        # define our OG response for this media
        og = {
            "og:description": media_info.download_name,
            "og:image": f"mxc://{self.server_name}/{media_info.filesystem_id}",
            "og:image:type": media_info.media_type,
            "matrix:image:size": media_info.media_length,
        }

        if dims:
            og["og:image:width"] = dims["width"]
            og["og:image:height"] = dims["height"]
        else:
            # Pass the URL as a logger argument so formatting is deferred
            # until the record is actually emitted.
            logger.warning("Couldn't get dims for %s", url)

    elif _is_html(media_info.media_type):
        # TODO: somehow stop a big HTML tree from exploding synapse's RAM

        with open(media_info.filename, "rb") as file:
            body = file.read()

        tree = decode_body(body, media_info.uri, media_info.media_type)
        if tree is not None:
            # Check if this HTML document points to oEmbed information and
            # defer to that.
            oembed_url = self._oembed.autodiscover_from_html(tree)
            og_from_oembed: JsonDict = {}
            if oembed_url:
                oembed_info = await self._handle_url(
                    oembed_url, user, allow_data_urls=True
                )
                (
                    og_from_oembed,
                    author_name,
                    expiration_ms,
                ) = await self._handle_oembed_response(
                    url, oembed_info, expiration_ms
                )

            # Parse Open Graph information from the HTML in case the oEmbed
            # response failed or is incomplete.
            og_from_html = parse_html_to_open_graph(tree, media_info.uri)

            # Compile the Open Graph response by using the scraped
            # information from the HTML and overlaying any information
            # from the oEmbed response.
            og = {**og_from_html, **og_from_oembed}

            await self._precache_image_url(user, media_info, og)
        else:
            og = {}

    elif oembed_url:
        # Handle the oEmbed information.
        og, author_name, expiration_ms = await self._handle_oembed_response(
            url, media_info, expiration_ms
        )
        await self._precache_image_url(user, media_info, og)

    else:
        logger.warning("Failed to find any OG data in %s", url)
        og = {}

    # If we don't have a title but we have author_name, copy it as
    # title
    if not og.get("og:title") and author_name:
        og["og:title"] = author_name

    # filter out any stupidly long values
    # (keys are collected first so the dict is not mutated while iterating)
    keys_to_remove = []
    for k, v in og.items():
        # values can be numeric as well as strings, hence the cast to str
        if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN:
            logger.warning(
                "Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN]
            )
            keys_to_remove.append(k)
    for k in keys_to_remove:
        del og[k]

    logger.debug("Calculated OG for %s as %s", url, og)

    jsonog = json_encoder.encode(og)

    # Cap the amount of time to consider a response valid.
    expiration_ms = min(expiration_ms, ONE_DAY)

    # store OG in history-aware DB cache
    await self.store.store_url_cache(
        url,
        media_info.response_code,
        media_info.etag,
        media_info.created_ts_ms + expiration_ms,
        jsonog,
        media_info.filesystem_id,
        media_info.created_ts_ms,
    )

    return jsonog.encode("utf8")
def test_no_tree(self) -> None:
    """A valid body with no tree in it."""
    # A single NUL byte is expected to decode to no tree at all.
    html = b"\x00"
    tree = decode_body(html, "http://example.com/test.html")
    self.assertIsNone(tree)
def test_empty(self) -> None:
    """Test a body with no data in it."""
    html = b""
    tree = decode_body(html, "http://example.com/test.html")
    self.assertIsNone(tree)