def test_implicit_utf8(self):
    """Check that UTF-8 is attempted although no candidate names it.

    The only candidate offered is ASCII, which cannot decode the input,
    so the call is expected to log at WARNING level or above and still
    return the text decoded as UTF-8.
    """
    payload = b'smile \xf0\x9f\x98\x83'
    candidates = (('ascii', 'bad header'), )
    with self.assertLogs(logger, logging.WARNING):
        decoded, used = decode_and_report(payload, candidates, logger)
    self.assertEqual(decoded, 'smile \U0001f603')
    self.assertEqual(used, 'utf-8')
def test_nonstandard(self):
    """Check how a non-standard name for an encoding is handled.

    The candidate is spelled C{ascii}; the call is expected to log at
    INFO level or above and to report the encoding as C{us-ascii}.
    """
    def candidates():
        # Supplied by a generator, so the candidates can be read only once.
        yield 'ascii', 'header'

    with self.assertLogs(logger, logging.INFO):
        decoded, used = decode_and_report(b'Hello', candidates(), logger)
    self.assertEqual(decoded, 'Hello')
    self.assertEqual(used, 'us-ascii')
def test_trivial(self):
    """Check that a straightforward input decodes without any logging."""
    def candidates():
        # Supplied by a generator, so the candidates can be read only once.
        yield 'us-ascii', 'header'

    with no_log(logger):
        decoded, used = decode_and_report(b'Hello', candidates(), logger)
    self.assertEqual(decoded, 'Hello')
    self.assertEqual(used, 'us-ascii')
def test_invalid(self):
    """Test what happens when there is no valid way to decode.

    The input ends in a truncated UTF-8 sequence, so none of the
    offered candidates can decode it and C{ValueError} is expected.
    """
    to_try = (
        ('us-ascii', 'HTTP header'),
        (None, 'Unicode BOM'),
        ('utf-8', 'XML declaration'),
        )
    with self.assertRaises(ValueError):
        # The result is deliberately not unpacked: a failed tuple
        # unpacking raises ValueError as well, which would let this
        # test pass without the decoder having rejected the input.
        decode_and_report(b'cut-off smile \xf0\x9f\x98', to_try, logger)
def test_none(self):
    """Check that candidates without an encoding name are skipped.

    C{None} entries surround the single usable candidate; decoding is
    expected to succeed with that candidate and without any logging.
    """
    payload = b'smile \xf0\x9f\x98\x83'
    candidates = (
        (None, 'HTTP header'),
        ('utf-8', 'XML declaration'),
        (None, 'Unicode BOM'),
        )
    with no_log(logger):
        decoded, used = decode_and_report(payload, candidates, logger)
    self.assertEqual(decoded, 'smile \U0001f603')
    self.assertEqual(used, 'utf-8')
def load_text(
        url: str, accept_header: str = 'text/plain'
        ) -> Tuple[Report, Optional[addinfourl], Optional[List[str]]]:
    """Fetch a text document and split it into lines.

    Redirects with HTTP status 301, 302, 303 or 307 are followed,
    up to a limit of 10.

    @param url:
        The URL of the document to load.
    @param accept_header:
        HTTP C{Accept} header to use for the request.
    @return: C{(report, response, contents)}
        C{report} is a L{Report} instance that may already have some
        messages logged to it.
        C{response} is the L{addinfourl} response object if a response
        was received from the server, or C{None} otherwise.
        C{contents} is the document as a list of lines, or C{None}
        if the loading failed.
    """
    max_redirects = 10
    redirects_seen = 0
    while True:
        report, response, content_bytes = load_page(
            url, accept_header=accept_header)

        if response is None:
            # No response was received at all.
            return report, response, None

        status = response.code
        if status in (200, None):
            break
        if status not in (301, 302, 303, 307):
            # Neither a success nor a redirect that we follow.
            return report, response, None

        redirects_seen += 1
        if redirects_seen > max_redirects:
            report.warning('Redirect limit exceeded')
            return report, response, None

        # The redirect target may lie outside the crawl root; that is
        # acceptable because this function is not used for the actual
        # crawling.
        url = response.url

    assert content_bytes is not None

    # Candidate encodings, listed with the Byte Order Mark first.
    candidates = (
        (encoding_from_bom(content_bytes), 'Byte Order Mark'),
        (response.headers.get_content_charset(), 'HTTP header'),
        )
    try:
        text, _used_encoding = decode_and_report(
            content_bytes, candidates, report)
    except ValueError as ex:
        report.error('Failed to decode text document: %s', ex)
        return report, response, None
    return report, response, _RE_EOLN.split(text)
def _check_response(self,
                    req_url: str,
                    report: Report,
                    response: addinfourl,
                    content_bytes: bytes) -> Iterator[Referrer]:
    """Check the server's response to a request.

    This is a generator: none of the checks below run until the
    returned iterator is consumed.

    Responses with an HTTP status other than 200 (or without a status)
    are not content-checked, and a response without a C{Content-Type}
    header is reported as an error; in both cases the method returns
    without notifying the plugins.

    Otherwise, documents that are XML or have a C{text/} content type
    are decoded, trying the encodings suggested by the Byte Order Mark,
    the XML declaration and the HTTP header. HTML and XML documents
    are then parsed and, if necessary, repaired, after which the
    referrers found in them are yielded. Finally the (possibly
    repaired) content is handed to the plugins.

    @param req_url:
        The URL that was requested. URLs starting with C{file:} get
        their content type silently corrected instead of a warning.
    @param report:
        Report to which all findings are logged.
    @param response:
        The response to check; its C{code} and C{headers} are inspected.
    @param content_bytes:
        The undecoded body of the response.
    @return:
        Yields the referrers found in the document, if it could be
        parsed as HTML or XML.
    """

    if response.code not in (200, None):
        # TODO: This should probably be user-selectable.
        #       A lot of web servers produce error and redirection
        #       notices that are not HTML5 compliant. Checking the
        #       content is likely only useful if the application
        #       under test is producing the content instead.
        report.info('Skipping content check because of HTTP status %d',
                    response.code)
        report.checked = Checked.HTTP_STATUS_SKIP
        return

    headers = response.headers
    content_type_header = headers['Content-Type']
    if content_type_header is None:
        # The problem is logged both globally and on this report.
        message = 'Missing Content-Type header'
        _LOG.error(message)
        report.error(message)
        return
    else:
        # Convert Header to plain string.
        content_type_header = str(content_type_header)

    content_type = headers.get_content_type()
    is_html = content_type in ('text/html', 'application/xhtml+xml')
    is_xml = content_type.endswith('/xml') or content_type.endswith('+xml')
    http_encoding = headers.get_content_charset()

    # Speculatively decode the first 1024 bytes, so we can look inside
    # the document for encoding clues.
    # Undecodable bytes are replaced rather than raising, since this
    # text is only inspected and never used as the document content.
    bom_encoding = encoding_from_bom(content_bytes)
    content_head = content_bytes[:1024].decode(bom_encoding or 'ascii',
                                               'replace')

    if not is_xml and content_head.startswith('<?xml'):
        # The content looks like XML even though the content type
        # did not say so.
        is_xml = True
        if req_url.startswith('file:'):
            # Silently correct content-type detection for local files.
            # This is not something the user can easily fix, so issuing
            # a warning would not be helpful.
            if content_type == 'text/html':
                content_type = 'application/xhtml+xml'
        else:
            report.warning(
                'Document is served with content type "%s" '
                'but starts with an XML declaration',
                content_type)

    if is_html and is_xml and self.accept is Accept.HTML:
        report.warning(
            'HTML document is serialized as XML, while the HTTP Accept '
            'header did not include "application/xhtml+xml"')

    if is_xml or content_type.startswith('text/'):
        # This looks like a text document, now figure out the encoding.

        # Look for encoding in XML declaration (if any).
        decl_encoding = encoding_from_xml_decl(content_head)

        # TODO: Also look at HTML <meta> tags.

        # Try possible encodings in order of precedence.
        # W3C recommends giving the BOM, if present, precedence over HTTP.
        #   http://www.w3.org/International/questions/qa-byte-order-mark
        try:
            content, used_encoding = decode_and_report(
                content_bytes,
                ((bom_encoding, 'Byte Order Mark'),
                 (decl_encoding, 'XML declaration'),
                 (http_encoding, 'HTTP header')),
                report)
        except ValueError as ex:
            # All likely encodings failed.
            # Parsing is skipped, but the plugins below still receive
            # the undecoded content.
            report.error('Failed to decode contents: %s', ex)
        else:
            if req_url.startswith('file:'):
                # Construct a new header that is likely more accurate.
                content_type_header = \
                    f'{content_type}; charset={used_encoding}'

            if is_html or is_xml:
                tree = parse_document(content, is_xml, report)
                if tree is not None:
                    if repair_tree(tree, content_type, report):
                        # Offer the repaired tree to plugins, so they
                        # are more likely to be able to do their work.
                        # NOTE(review): the repaired bytes are always
                        # UTF-8, while for non-file URLs the header
                        # passed to the plugins may still name another
                        # charset -- confirm that plugins cope.
                        repaired = etree.tostring(tree, encoding='utf-8')
                        assert isinstance(repaired, bytes)
                        content_bytes = repaired

                    # Find links to other documents.
                    yield from self.find_referrers_in_xml(
                        tree, req_url, report)
                    if is_html:
                        yield from self.find_referrers_in_html(
                            tree, req_url)

    # Reached for every response that passed the status and header
    # checks, whether or not it could be decoded or parsed.
    self.plugins.resource_loaded(content_bytes, content_type_header, report)