예제 #1
0
파일: test.py 프로젝트: Connexions/tinycss2
def test_stylesheet_bytes(kwargs):
    """Adapt one test-case dict and run it through ``parse_stylesheet_bytes``.

    The dict is modified in place: ``css_bytes`` is encoded to Latin-1 bytes,
    any ``comment`` entry is discarded, a truthy ``environment_encoding``
    label is replaced by the result of ``lookup``, and the entries of ``SKIP``
    are merged in. The resulting dict is passed as keyword arguments.

    :param kwargs: Mutable mapping describing the test case.
    :returns: Whatever ``parse_stylesheet_bytes`` returns.
    """
    css_text = kwargs['css_bytes']
    kwargs['css_bytes'] = css_text.encode('latin1')
    # 'comment' is not an argument of the parser; drop it if present.
    kwargs.pop('comment', None)
    env_label = kwargs.get('environment_encoding')
    if env_label:
        kwargs['environment_encoding'] = lookup(env_label)
    kwargs.update(SKIP)
    return parse_stylesheet_bytes(**kwargs)
예제 #2
0
    def detect_xml_encoding(self):
        """Detect an encoding from an XML prolog.

        Searches ``self.doc.source_data`` with ``_RE_XML_ENCODING`` (pattern
        defined elsewhere) and resolves the first captured group through
        ``webencodings.lookup``.

        :returns:
            The ``codec_info`` of the resolved encoding, or ``None`` when
            nothing matches or the lookup yields nothing.
        """
        found = _RE_XML_ENCODING.search(self.doc.source_data)
        if not found:
            return None

        encoding = webencodings.lookup(found.group(1))
        if not encoding:
            return None
        return encoding.codec_info
예제 #3
0
    def guess_encoding(self):
        """Make an expensive guess of the charset with the chardet library.

        Runs ``cchardet.detect`` over ``self.doc.source_data`` and resolves the
        reported ``"encoding"`` entry through ``webencodings.lookup``.

        :returns:
            The ``codec_info`` of the resolved encoding, or ``None`` when no
            encoding is reported or the lookup yields nothing.
        """
        # TODO: would it be faster to look only in the first N thousand bytes?
        guess = cchardet.detect(self.doc.source_data)
        label = guess.get("encoding")
        if not label:
            return None

        encoding = webencodings.lookup(label)
        return encoding.codec_info if encoding else None
예제 #4
0
파일: bytes.py 프로젝트: spladug/tinycss2
def decode_stylesheet_bytes(css_bytes, protocol_encoding=None,
                            environment_encoding=None):
    """Determine the character encoding of a CSS stylesheet and decode it.

    This is based on the presence of a :abbr:`BOM (Byte Order Mark)`,
    an ``@charset`` rule,
    and encoding meta-information.

    The encoding handed to ``decode`` is the first one available from:
    a recognised ``protocol_encoding`` label, a recognised label in an
    ``@charset "...";`` rule at the very start of the input,
    ``environment_encoding``, and finally UTF-8.
    (Assuming ``decode`` is :func:`webencodings.decode`, that encoding is
    only a fallback: a BOM in the input takes precedence.)

    :param css_bytes: A byte string.
    :param protocol_encoding:
        The encoding label, if any, defined by HTTP or equivalent protocol.
        (e.g. via the ``charset`` parameter of the ``Content-Type`` header.)
        A label that ``lookup`` does not recognise is ignored.
    :param environment_encoding:
        A :class:`webencodings.Encoding` object
        for the `environment encoding
        <http://www.w3.org/TR/css-syntax/#environment-encoding>`_,
        if any.
    :returns:
        A 2-tuple of a decoded Unicode string
        and the :class:`webencodings.Encoding` object that was used.

    """
    # http://dev.w3.org/csswg/css-syntax/#the-input-byte-stream
    if protocol_encoding:
        from_protocol = lookup(protocol_encoding)
        if from_protocol:
            return decode(css_bytes, from_protocol)

    charset_prefix = b'@charset "'
    if css_bytes.startswith(charset_prefix):
        label_start = len(charset_prefix)
        # The closing quote is only searched for up to byte 100; the bound is
        # arbitrary and simply caps the length of an accepted label.
        label_end = css_bytes.find(b'"', label_start, 100)
        if label_end != -1 and css_bytes.startswith(b'";', label_end):
            label = css_bytes[label_start:label_end].decode('latin1')
            declared = lookup(label)
            if declared:
                # A UTF-16 label inside a rule that was readable byte-wise
                # is not used as such; UTF-8 is substituted.
                if declared.name in ('utf-16be', 'utf-16le'):
                    declared = UTF8
                return decode(css_bytes, declared)

    return decode(css_bytes, environment_encoding or UTF8)
예제 #5
0
    def detect_meta_charset(self):
        """Return the encoding found in meta tags in the doc.

        Iterates over ``GUMBOCY_PARSER_HEAD.listnodes()`` and returns at the
        first ``meta`` node that yields an encoding, either from its
        ``charset`` attribute (resolved with ``webencodings.lookup``) or from
        the ``content`` of an ``http-equiv="content-type"`` declaration.

        :returns: The detected encoding info, or ``None`` if no node yields one.
        """
        for node in GUMBOCY_PARSER_HEAD.listnodes():
            # Only meta nodes that carry a third element (used below as an
            # attribute mapping) are of interest.
            if node[1] != "meta" or len(node) <= 2:
                continue

            attrs = node[2]
            charset = attrs.get("charset")
            if charset:
                encoding = webencodings.lookup(charset)
                if encoding:
                    return encoding.codec_info
                # An unresolvable charset attribute does not fall back to the
                # http-equiv handling of the same node.
                continue

            http_equiv = attrs.get("http-equiv", "").lower().strip()
            if http_equiv == "content-type":
                from_header = get_encoding_from_content_type(attrs.get("content", ""))
                if from_header:
                    return from_header
예제 #6
0
    def detect_meta_charset(self):
        """Return the encoding found in meta tags in the doc.

        Each node from ``GUMBOCY_PARSER_HEAD.listnodes()`` is inspected in
        order; the first ``meta`` node that provides a resolvable ``charset``
        attribute, or an ``http-equiv="content-type"`` declaration whose
        ``content`` yields an encoding, determines the result.

        :returns: The detected encoding info, or ``None`` if no node yields one.
        """
        for node in GUMBOCY_PARSER_HEAD.listnodes():
            is_meta_with_attrs = node[1] == "meta" and len(node) > 2
            if not is_meta_with_attrs:
                continue

            attributes = node[2]
            declared = attributes.get("charset")
            if declared:
                resolved = webencodings.lookup(declared)
                if resolved:
                    return resolved.codec_info
                # A charset attribute that cannot be resolved ends the
                # handling of this node; http-equiv is not consulted.
                continue

            equiv = attributes.get("http-equiv", "")
            if equiv.lower().strip() == "content-type":
                content = attributes.get("content", "")
                meta_encoding = get_encoding_from_content_type(content)
                if meta_encoding:
                    return meta_encoding
예제 #7
0
def lookupEncoding(encoding):
    """Resolve an encoding label with ``webencodings.lookup``.

    Byte-string labels are first decoded as ASCII. Returns the result of
    ``webencodings.lookup`` for the label, or None when the label is None,
    is a byte string containing non-ASCII bytes, or makes the lookup raise
    AttributeError."""
    if isinstance(encoding, bytes):
        try:
            encoding = encoding.decode("ascii")
        except UnicodeDecodeError:
            return None

    if encoding is None:
        return None

    try:
        return webencodings.lookup(encoding)
    except AttributeError:
        return None
예제 #8
0
def lookupEncoding(encoding):
    """Resolve an encoding label with ``webencodings.lookup``.

    Labels that are instances of ``binary_type`` are first decoded as ASCII.
    Returns the result of ``webencodings.lookup`` for the label, or None when
    the label is None, cannot be decoded as ASCII, or makes the lookup raise
    AttributeError."""
    if isinstance(encoding, binary_type):
        try:
            encoding = encoding.decode("ascii")
        except UnicodeDecodeError:
            return None

    if encoding is None:
        return None

    try:
        return webencodings.lookup(encoding)
    except AttributeError:
        return None
예제 #9
0
def get_encoding_from_content_type(content_type):
    """Return codec info for the ``charset`` parameter of a Content-Type value.

    :param content_type:
        The header value, e.g. ``text/html; charset=utf-8``. Either a byte
        string (non-ASCII bytes are dropped before parsing) or an
        already-decoded text string.
    :returns:
        The ``codec_info`` of the encoding that ``webencodings.lookup``
        resolves the charset to, or ``None`` when there is no (non-empty)
        charset parameter or the lookup yields nothing.
    """
    # Only byte strings need decoding. Text strings have no ``.decode`` on
    # Python 3, so decoding unconditionally raised AttributeError for them.
    if isinstance(content_type, bytes):
        content_type = content_type.decode("ascii", "ignore")

    _, params = cgi.parse_header(content_type)
    charset = params.get("charset")
    if not charset:
        return None

    detected = webencodings.lookup(charset)
    if detected:
        return detected.codec_info
    return None
예제 #10
0
def get_encoding_from_content_type(content_type):
    """Return codec info for the ``charset`` parameter of a Content-Type value.

    :param content_type:
        The header value, e.g. ``text/html; charset=utf-8``. Either a byte
        string (non-ASCII bytes are dropped before parsing) or an
        already-decoded text string.
    :returns:
        The ``codec_info`` of the encoding that ``webencodings.lookup``
        resolves the charset to, or ``None`` when there is no (non-empty)
        charset parameter or the lookup yields nothing.
    """
    # Only byte strings need decoding. Text strings have no ``.decode`` on
    # Python 3, so decoding unconditionally raised AttributeError for them.
    if isinstance(content_type, bytes):
        content_type = content_type.decode("ascii", "ignore")

    _, params = cgi.parse_header(content_type)
    charset = params.get("charset")
    if not charset:
        return None

    detected = webencodings.lookup(charset)
    if detected:
        return detected.codec_info
    return None