def test_stylesheet_bytes(kwargs):
    """Feed one test-case description to ``parse_stylesheet_bytes``.

    ``kwargs`` is modified in place before being splatted into the parser:

    * ``css_bytes`` (a text string) is re-encoded to bytes as latin1;
    * a ``comment`` entry, if present, is dropped;
    * a truthy ``environment_encoding`` label is replaced by the result of
      ``lookup`` on it;
    * the entries of ``SKIP`` are merged in.

    :returns: Whatever ``parse_stylesheet_bytes`` returns for these arguments.
    """
    css_text = kwargs['css_bytes']
    kwargs['css_bytes'] = css_text.encode('latin1')

    # 'comment' is dropped rather than forwarded to the parser.
    kwargs.pop('comment', None)

    env_label = kwargs.get('environment_encoding')
    if env_label:
        kwargs['environment_encoding'] = lookup(env_label)

    kwargs.update(SKIP)
    return parse_stylesheet_bytes(**kwargs)
def detect_xml_encoding(self):
    """ Looks for an encoding label in the document source (XML prolog).

    Searches ``self.doc.source_data`` with ``_RE_XML_ENCODING`` and resolves
    the first captured group through ``webencodings.lookup``.

    Returns the ``codec_info`` of the resolved encoding, or None when the
    pattern does not match or the label is not recognised.
    """
    prolog = _RE_XML_ENCODING.search(self.doc.source_data)
    if not prolog:
        return None

    encoding = webencodings.lookup(prolog.group(1))
    return encoding.codec_info if encoding else None
def guess_encoding(self):
    """ Guesses the charset by running cchardet over the whole source (expensive).

    Returns the ``codec_info`` of the guessed encoding, or None when cchardet
    reports no encoding or webencodings does not recognise the reported label.
    """
    # TODO: would it be faster to look only in the first N thousand bytes?
    guessed_label = cchardet.detect(self.doc.source_data).get("encoding")
    if not guessed_label:
        return None

    encoding = webencodings.lookup(guessed_label)
    return encoding.codec_info if encoding else None
def decode_stylesheet_bytes(css_bytes, protocol_encoding=None,
                            environment_encoding=None):
    """Determine the character encoding of a CSS stylesheet and decode it.

    The encoding handed to ``decode`` is the first of the following that
    applies:

    1. ``protocol_encoding``, when it is a label that ``lookup`` recognises;
    2. the label of a leading ``@charset "...";`` rule, when ``lookup``
       recognises it (a UTF-16 BE/LE label is replaced by UTF-8);
    3. ``environment_encoding``, when given;
    4. UTF-8.

    Any byte order mark handling is left to ``decode`` (presumably it takes
    precedence over the encoding chosen here -- see the linked specification).

    :param css_bytes: A byte string.
    :param protocol_encoding:
        The encoding label, if any, defined by HTTP or equivalent protocol.
        (e.g. via the ``charset`` parameter of the ``Content-Type`` header.)
    :param environment_encoding:
        A :class:`webencodings.Encoding` object for the `environment encoding
        <http://www.w3.org/TR/css-syntax/#environment-encoding>`_, if any.
    :returns:
        A 2-tuple of a decoded Unicode string and the
        :class:`webencodings.Encoding` object that was used.

    """
    # http://dev.w3.org/csswg/css-syntax/#the-input-byte-stream
    charset_prefix = b'@charset "'

    # 1. Encoding announced by the transport protocol.
    from_protocol = lookup(protocol_encoding) if protocol_encoding else None
    if from_protocol:
        return decode(css_bytes, from_protocol)

    # 2. Encoding announced by an @charset rule at the very start.
    if css_bytes.startswith(charset_prefix):
        label_start = len(charset_prefix)
        # The search stops at byte 100 (an arbitrary cap) so that no encoding
        # label is longer than 100 - len(prefix) bytes.
        label_end = css_bytes.find(b'"', label_start, 100)
        if label_end != -1 and css_bytes.startswith(b'";', label_end):
            label = css_bytes[label_start:label_end].decode('latin1')
            from_rule = lookup(label)
            if from_rule:
                # A UTF-16 label in the rule is decoded as UTF-8 instead.
                if from_rule.name in ('utf-16be', 'utf-16le'):
                    from_rule = UTF8
                return decode(css_bytes, from_rule)

    # 3. / 4. Environment encoding if provided, UTF-8 as the last resort.
    return decode(css_bytes, environment_encoding or UTF8)
def detect_meta_charset(self):
    """ Scans the nodes listed by GUMBOCY_PARSER_HEAD for a <meta> encoding.

    Two forms are recognised on "meta" nodes that carry a third element
    (used here as the attribute mapping):

    * a non-empty ``charset`` attribute, resolved with ``webencodings.lookup``;
    * ``http-equiv="content-type"`` (case/whitespace-insensitive), whose
      ``content`` attribute goes through ``get_encoding_from_content_type``.

    Returns the first encoding found, or None if no node yields one.
    """
    for node in GUMBOCY_PARSER_HEAD.listnodes():
        # Skip anything that is not a <meta> node with attributes.
        if node[1] != "meta" or len(node) <= 2:
            continue

        attrs = node[2]
        charset = attrs.get("charset")

        if charset:
            encoding = webencodings.lookup(charset)
            if encoding:
                return encoding.codec_info

        elif attrs.get("http-equiv", "").lower().strip() == "content-type":
            from_header = get_encoding_from_content_type(attrs.get("content", ""))
            if from_header:
                return from_header

    return None
def detect_meta_charset(self):
    """ Scans the nodes listed by GUMBOCY_PARSER_HEAD for a <meta> encoding.

    Two forms are recognised on "meta" nodes that carry a third element
    (used here as the attribute mapping):

    * a non-empty ``charset`` attribute, resolved with ``webencodings.lookup``;
    * ``http-equiv="content-type"`` (case/whitespace-insensitive), whose
      ``content`` attribute goes through ``get_encoding_from_content_type``.

    Returns the first encoding found, or None if no node yields one.
    """
    for node in GUMBOCY_PARSER_HEAD.listnodes():
        # Skip anything that is not a <meta> node with attributes.
        if node[1] != "meta" or len(node) <= 2:
            continue

        attrs = node[2]
        charset = attrs.get("charset")

        if charset:
            encoding = webencodings.lookup(charset)
            if encoding:
                return encoding.codec_info

        elif attrs.get("http-equiv", "").lower().strip() == "content-type":
            from_header = get_encoding_from_content_type(attrs.get("content", ""))
            if from_header:
                return from_header

    return None
def lookupEncoding(encoding):
    """Return what ``webencodings.lookup`` yields for *encoding*, or None.

    A byte-string label is first decoded as ASCII. None is returned when the
    label is None, when the bytes are not valid ASCII, or when the lookup
    raises AttributeError."""
    label = encoding

    if isinstance(label, bytes):
        try:
            label = label.decode("ascii")
        except UnicodeDecodeError:
            # Not an ASCII label, so it cannot name a known encoding.
            return None

    if label is None:
        return None

    try:
        return webencodings.lookup(label)
    except AttributeError:
        return None
def lookupEncoding(encoding):
    """Return what ``webencodings.lookup`` yields for *encoding*, or None.

    A ``binary_type`` label is first decoded as ASCII. None is returned when
    the label is None, when the bytes are not valid ASCII, or when the lookup
    raises AttributeError."""
    label = encoding

    if isinstance(label, binary_type):
        try:
            label = label.decode("ascii")
        except UnicodeDecodeError:
            # Not an ASCII label, so it cannot name a known encoding.
            return None

    if label is None:
        return None

    try:
        return webencodings.lookup(label)
    except AttributeError:
        return None
def get_encoding_from_content_type(content_type):
    """ Resolves the ``charset`` parameter of a Content-Type value.

    ``content_type`` is decoded as ASCII (undecodable bytes are ignored) and
    parsed with ``cgi.parse_header``; a non-empty ``charset`` parameter is
    then resolved through ``webencodings.lookup``.

    Returns the ``codec_info`` of the resolved encoding, or None when there
    is no charset parameter or the label is not recognised.
    """
    # NOTE(review): .decode is called unconditionally, so this expects a byte
    # string -- confirm what callers pass when running on Python 3.
    header_text = content_type.decode("ascii", "ignore")
    _media_type, header_params = cgi.parse_header(header_text)

    charset = header_params.get("charset")
    if not charset:
        return None

    encoding = webencodings.lookup(charset)
    return encoding.codec_info if encoding else None
def get_encoding_from_content_type(content_type):
    """ Resolves the ``charset`` parameter of a Content-Type value.

    ``content_type`` is decoded as ASCII (undecodable bytes are ignored) and
    parsed with ``cgi.parse_header``; a non-empty ``charset`` parameter is
    then resolved through ``webencodings.lookup``.

    Returns the ``codec_info`` of the resolved encoding, or None when there
    is no charset parameter or the label is not recognised.
    """
    # NOTE(review): .decode is called unconditionally, so this expects a byte
    # string -- confirm what callers pass when running on Python 3.
    header_text = content_type.decode("ascii", "ignore")
    _media_type, header_params = cgi.parse_header(header_text)

    charset = header_params.get("charset")
    if not charset:
        return None

    encoding = webencodings.lookup(charset)
    return encoding.codec_info if encoding else None