Example #1
def fixup(match):
    """
    Replace one matched HTML entity with the character it represents,
    if possible.
    """
    text = match.group(0)
    if text[:2] == "&#":
        # character reference
        try:
            if text[:3] == "&#x":
                codept = int(text[3:-1], 16)
            else:
                codept = int(text[2:-1])
            if 0x80 <= codept < 0xa0:
                # Decode this range of characters as Windows-1252, as Web
                # browsers do in practice.
                return unichr(codept).encode('latin-1').decode('sloppy-windows-1252')
            else:
                return unichr(codept)
        except ValueError:
            pass
    else:
        # named entity
        try:
            text = entities[text[1:]]
        except KeyError:
            pass
    return text  # leave as is
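
A minimal way to exercise fixup() is to hand it to re.sub. The entity table and pattern below are illustrative stand-ins, not ftfy's actual definitions, and unichr is spelled chr on Python 3; the 0x80-0x9F branch also needs ftfy's sloppy-windows-1252 codec, which importing ftfy.bad_codecs registers.

import re

unichr = chr  # Python 3 spelling of the name used above

# Stand-in tables (assumptions for illustration). Keys keep their
# semicolons because fixup() looks up text[1:].
entities = {'amp;': '&', 'lt;': '<', 'gt;': '>'}
HTML_ENTITY_RE = re.compile(r'&#?\w+;')

def unescape_html(text):
    return HTML_ENTITY_RE.sub(fixup, text)

print(unescape_html('&lt;b&gt;caf&#233;&lt;/b&gt;'))  # <b>café</b>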
Example #2
def _build_regexes():
    """
    ENCODING_REGEXES contains reasonably fast ways to detect whether we
    could represent a given string in a given encoding. The simplest one is
    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.
    """
    # Define a regex that matches ASCII text.
    encoding_regexes = {'ascii': re.compile('^[\x00-\x7f]*$')}

    for encoding in CHARMAP_ENCODINGS:
        # Make a sequence of characters that bytes \x80 to \xFF decode to
        # in each encoding, as well as byte \x1A, which is used to represent
        # the replacement character � in the sloppy-* encodings.
        latin1table = ''.join(unichr(i) for i in range(128, 256)) + '\x1a'
        charlist = latin1table.encode('latin-1').decode(encoding)

        # The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B
        # to \x7F -- will decode as those ASCII characters in any encoding we
        # support, so we can just include them as ranges. This also lets us
        # not worry about escaping regex special characters, because all of
        # them are in the \x1B to \x7F range.
        regex = '^[\x00-\x19\x1b-\x7f{0}]*$'.format(charlist)
        encoding_regexes[encoding] = re.compile(regex)
    return encoding_regexes
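
A sketch of how the table might be consumed, in the spirit of ftfy's possible_encoding(). CHARMAP_ENCODINGS here is an illustrative subset limited to codecs in which every byte decodes; the real list relies on ftfy's sloppy-* codecs instead.

import re

unichr = chr  # Python 3 spelling

CHARMAP_ENCODINGS = ['latin-1', 'iso-8859-5']  # assumption for the demo

ENCODING_REGEXES = _build_regexes()

def possible_encoding(text, encoding):
    # One regex match instead of a try/except around text.encode().
    return bool(ENCODING_REGEXES[encoding].match(text))

print(possible_encoding('café', 'latin-1'))       # True
print(possible_encoding('Привет', 'latin-1'))     # False
print(possible_encoding('Привет', 'iso-8859-5'))  # True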
Example #3
def _build_regexes():
    """
    ENCODING_REGEXES contains reasonably fast ways to detect whether we
    could represent a given string in a given encoding. The simplest one is
    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.
    """
    # Define a regex that matches ASCII text.
    encoding_regexes = {u'ascii': re.compile('^[\x00-\x7f]*$')}

    for encoding in CHARMAP_ENCODINGS:
        # Make a sequence of characters that bytes \x80 to \xFF decode to
        # in each encoding, as well as byte \x1A, which is used to represent
        # the replacement character � in the sloppy-* encodings.
        latin1table = u''.join(unichr(i) for i in range(128, 256)) + '\x1a'
        charlist = latin1table.encode(u'latin-1').decode(encoding)

        # The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B
        # to \x7F -- will decode as those ASCII characters in any encoding we
        # support, so we can just include them as ranges. This also lets us
        # not worry about escaping regex special characters, because all of
        # them are in the \x1B to \x7F range.
        regex = u'^[\x00-\x19\x1b-\x7f{0}]*$'.format(charlist)
        encoding_regexes[encoding] = re.compile(regex)
    return encoding_regexes
Example #4
def remove_bom(text):
    r"""
    Remove a left-over byte-order mark.

    >>> print(remove_bom("\ufeffWhere do you want to go today?"))
    Where do you want to go today?
    """
    return text.lstrip(unichr(0xfeff))
Example #5
def remove_bom(text):
    r"""
    Remove a left-over byte-order mark.

    >>> print(remove_bom(unichr(0xfeff) + "Where do you want to go today?"))
    Where do you want to go today?
    """
    return text.lstrip(unichr(0xfeff))
Example #6
def remove_bom(text):
    r"""
    Remove a byte-order mark that was accidentally decoded as if it were part
    of the text.

    >>> print(remove_bom("\ufeffWhere do you want to go today?"))
    Where do you want to go today?
    """
    return text.lstrip(unichr(0xfeff))
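
The same function in Python 3 spelling, where unichr has become chr:

def remove_bom(text):
    return text.lstrip(chr(0xfeff))

assert remove_bom('\ufeffWhere do you want to go today?') == 'Where do you want to go today?'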
Example #7
def _build_charmaps():
    """
    CHARMAPS contains mappings from bytes to characters, for each single-byte
    encoding we know about.

    We don't use Python's decoders here because they're too strict. Many
    non-Python programs will leave mysterious bytes alone instead of raising
    an error or removing them. For example, Python will not decode 0x81 in
    Windows-1252 because it doesn't map to anything. Other systems will decode
    it to U+0081, which actually makes no sense because that's a meaningless
    control character from Latin-1, but I guess at least it preserves some
    information that ftfy can take advantage of.

    So that's what we do. When other systems decode 0x81 as U+0081, we match
    their behavior in case it helps us get reasonable text.

    Meanwhile, ENCODING_REGEXES contains reasonably fast ways to detect whether we
    could represent a given string in a given encoding. The simplest one is
    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.
    """
    charmaps = {}
    encoding_regexes = {'ascii': re.compile('^[\x00-\x7f]*$')}
    for encoding in CHARMAP_ENCODINGS:
        charmap = {}
        for codepoint in range(0, 0x80):
            charmap[codepoint] = unichr(codepoint)
        for codepoint in range(0x80, 0x100):
            char = unichr(codepoint)
            encoded_char = char.encode('latin-1')
            try:
                decoded_char = encoded_char.decode(encoding)
            except ValueError:
                decoded_char = char
            charmap[ord(decoded_char)] = char

        charlist = [
            unichr(codept) for codept in sorted(charmap.keys())
            if codept >= 0x80
        ]
        regex = '^[\x00-\x7f{}]*$'.format(''.join(charlist))
        charmaps[encoding] = charmap
        encoding_regexes[encoding] = re.compile(regex)
    return charmaps, encoding_regexes
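
What the charmaps buy you: str.translate can map decoded text back to the Latin-1 characters whose codepoints equal the original bytes, undoing a decode without a strict codec. The helper name and the one-codec CHARMAP_ENCODINGS below are assumptions for illustration.

import re

unichr = chr  # Python 3 spelling

CHARMAP_ENCODINGS = ['iso-8859-5']  # assumption: a codec in which all bytes decode

CHARMAPS, ENCODING_REGEXES = _build_charmaps()

def reencode_with_charmap(text, encoding):
    # translate() maps each character to the Latin-1 character of its
    # source byte; encoding that as latin-1 recovers the original bytes.
    return text.translate(CHARMAPS[encoding]).encode('latin-1')

raw = 'Привет'.encode('iso-8859-5')
assert reencode_with_charmap(raw.decode('iso-8859-5'), 'iso-8859-5') == raw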
Example #8
def convert_surrogate_pair(match):
    """
    Convert a surrogate pair to the single codepoint it represents.

    This implements the formula described at:
    http://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates
    """
    pair = match.group(0)
    codept = 0x10000 + (ord(pair[0]) - 0xd800) * 0x400 + (ord(pair[1]) - 0xdc00)
    return unichr(codept)
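
To see it work end to end, pair it with a surrogate-pair pattern (reconstructed below; ftfy's own regex may be spelled differently). As a worked case, the pair U+D83D U+DE02 yields 0x10000 + 0x3D * 0x400 + 0x202 = 0x1F602.

import re

unichr = chr  # Python 3 spelling

SURROGATES_RE = re.compile('[\ud800-\udbff][\udc00-\udfff]')  # reconstruction

broken = 'nice\ud83d\ude02'  # a pair that should be U+1F602
fixed = SURROGATES_RE.sub(convert_surrogate_pair, broken)
assert fixed == 'nice\U0001F602'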
Example #9
def fix_java_encoding(bytestring):
    """
    Convert a bytestring that might contain "Java UTF8" into valid UTF-8.

    There are two things that Java is known to do with its "UTF8" encoder
    that are incompatible with UTF-8. (If you happen to be writing Java
    code, apparently the standards-compliant encoder is named "AS32UTF8".)

    - Every UTF-16 character is separately encoded as UTF-8. This is wrong
      when the UTF-16 string contains surrogates; the character they actually
      represent should have been encoded as UTF-8 instead. Unicode calls this
      "CESU-8", the Compatibility Encoding Scheme for Unicode. Python 2 will
      decode it as if it's UTF-8, but Python 3 refuses to.

    - The null codepoint, U+0000, is encoded as 0xc0 0x80, which avoids
      outputting a null byte by breaking the UTF shortest-form rule.
      Unicode does not even deign to give this scheme a name, and no version
      of Python will decode it.
    """
    assert isinstance(bytestring, bytes)
    # Replace the sloppy encoding of U+0000 with the correct one.
    bytestring = bytestring.replace(b'\xc0\x80', b'\x00')

    # When we have improperly encoded surrogates, we can still see the
    # bits that they were meant to represent.
    #
    # The surrogates were meant to encode a 20-bit number, to which we
    # add 0x10000 to get a codepoint. That 20-bit number now appears in
    # this form:
    #
    #   11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst
    #
    # The CESU8_RE above matches byte sequences of this form. Then we need
    # to extract the bits and assemble a codepoint number from them.
    match = CESU8_RE.search(bytestring)
    fixed_pieces = []
    while match:
        pos = match.start()
        cesu8_sequence = bytes_to_ints(bytestring[pos:pos + 6])
        assert cesu8_sequence[0] == cesu8_sequence[3] == 0xed
        codepoint = (
            ((cesu8_sequence[1] & 0x0f) << 16) +
            ((cesu8_sequence[2] & 0x3f) << 10) +
            ((cesu8_sequence[4] & 0x0f) << 6) +
            (cesu8_sequence[5] & 0x3f) +
            0x10000
        )
        # For the reason why this will work on all Python builds, see
        # compatibility.py.
        new_bytes = unichr(codepoint).encode('utf-8')
        fixed_pieces.append(bytestring[:pos] + new_bytes)
        bytestring = bytestring[pos + 6:]
        match = CESU8_RE.search(bytestring)  # search, not match: the next CESU-8 sequence may not be adjacent

    return b''.join(fixed_pieces) + bytestring
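
Running this standalone needs CESU8_RE and bytes_to_ints; the definitions below are reconstructions from the bit pattern described in the comments, not ftfy's own code.

import re

unichr = chr  # Python 3 spelling

# High surrogate (ED A0-AF 80-BF) followed by low surrogate (ED B0-BF 80-BF).
CESU8_RE = re.compile(b'\xed[\xa0-\xaf][\x80-\xbf]\xed[\xb0-\xbf][\x80-\xbf]')

def bytes_to_ints(bytestring):
    return list(bytestring)  # iterating bytes yields ints on Python 3

# b'\xed\xa0\xbd\xed\xb8\x82' is the CESU-8 spelling of U+1F602.
java_utf8 = b'null:\xc0\x80 face:\xed\xa0\xbd\xed\xb8\x82'
assert fix_java_encoding(java_utf8) == b'null:\x00 face:\xf0\x9f\x98\x82'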
Example #10
def _build_charmaps():
    """
    CHARMAPS contains mappings from bytes to characters, for each single-byte
    encoding we know about.

    We don't use Python's decoders here because they're too strict. Many
    non-Python programs will leave mysterious bytes alone instead of raising
    an error or removing them. For example, Python will not decode 0x81 in
    Windows-1252 because it doesn't map to anything. Other systems will decode
    it to U+0081, which actually makes no sense because that's a meaningless
    control character from Latin-1, but I guess at least it preserves some
    information that ftfy can take advantage of.

    So that's what we do. When other systems decode 0x81 as U+0081, we match
    their behavior in case it helps us get reasonable text.

    Meanwhile, ENCODING_REGEXES contains reasonably fast ways to detect whether we
    could represent a given string in a given encoding. The simplest one is
    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.
    """
    charmaps = {}
    encoding_regexes = {'ascii': re.compile('^[\x00-\x7f]*$')}
    for encoding in CHARMAP_ENCODINGS:
        charmap = {}
        for codepoint in range(0, 0x80):
            charmap[codepoint] = unichr(codepoint)
        for codepoint in range(0x80, 0x100):
            char = unichr(codepoint)
            encoded_char = char.encode('latin-1')
            try:
                decoded_char = encoded_char.decode(encoding)
            except ValueError:
                decoded_char = char
            charmap[ord(decoded_char)] = char

        charlist = [unichr(codept) for codept in sorted(charmap.keys())
                    if codept >= 0x80]
        regex = '^[\x00-\x7f{}]*$'.format(''.join(charlist))
        charmaps[encoding] = charmap
        encoding_regexes[encoding] = re.compile(regex)
    return charmaps, encoding_regexes
Example #11
def fix_java_encoding(bytestring):
    """
    Convert a bytestring that might contain "Java UTF8" into valid UTF-8.

    There are two things that Java is known to do with its "UTF8" encoder
    that are incompatible with UTF-8. (If you happen to be writing Java
    code, apparently the standards-compliant encoder is named "AS32UTF8".)

    - Every UTF-16 character is separately encoded as UTF-8. This is wrong
      when the UTF-16 string contains surrogates; the character they actually
      represent should have been encoded as UTF-8 instead. Unicode calls this
      "CESU-8", the Compatibility Encoding Scheme for Unicode. Python 2 will
      decode it as if it's UTF-8, but Python 3 refuses to.

    - The null codepoint, U+0000, is encoded as 0xc0 0x80, which avoids
      outputting a null byte by breaking the UTF shortest-form rule.
      Unicode does not even deign to give this scheme a name, and no version
      of Python will decode it.
    """
    assert isinstance(bytestring, bytes)
    # Replace the sloppy encoding of U+0000 with the correct one.
    bytestring = bytestring.replace(b'\xc0\x80', b'\x00')

    # When we have improperly encoded surrogates, we can still see the
    # bits that they were meant to represent.
    #
    # The surrogates were meant to encode a 20-bit number, to which we
    # add 0x10000 to get a codepoint. That 20-bit number now appears in
    # this form:
    #
    #   11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst
    #
    # The CESU8_RE above matches byte sequences of this form. Then we need
    # to extract the bits and assemble a codepoint number from them.
    match = CESU8_RE.search(bytestring)
    fixed_pieces = []
    while match:
        pos = match.start()
        cesu8_sequence = bytes_to_ints(bytestring[pos:pos + 6])
        assert cesu8_sequence[0] == cesu8_sequence[3] == 0xed
        codepoint = (((cesu8_sequence[1] & 0x0f) << 16) +
                     ((cesu8_sequence[2] & 0x3f) << 10) +
                     ((cesu8_sequence[4] & 0x0f) << 6) +
                     (cesu8_sequence[5] & 0x3f) + 0x10000)
        # For the reason why this will work on all Python builds, see
        # compatibility.py.
        new_bytes = unichr(codepoint).encode('utf-8')
        fixed_pieces.append(bytestring[:pos] + new_bytes)
        bytestring = bytestring[pos + 6:]
        match = CESU8_RE.search(bytestring)  # search, not match: the next CESU-8 sequence may not be adjacent

    return b''.join(fixed_pieces) + bytestring
Example #12
def fixup(match):
    """
    Replace one matched HTML entity with the character it represents,
    if possible.
    """
    text = match.group(0)
    if text[:2] == "&#":
        # character reference
        try:
            if text[:3] == "&#x":
                return unichr(int(text[3:-1], 16))
            else:
                return unichr(int(text[2:-1]))
        except ValueError:
            pass
    else:
        # named entity
        try:
            text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
        except KeyError:
            pass
    return text  # leave as is
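
On Python 3, htmlentitydefs lives at html.entities; alias it, supply a pattern (an assumption, as before), and the function runs unchanged.

import re
from html import entities as htmlentitydefs  # Python 3 home of name2codepoint

unichr = chr

HTML_ENTITY_RE = re.compile(r'&#?\w+;')  # illustrative pattern

assert HTML_ENTITY_RE.sub(fixup, '&pound;20 &#8800; &#x20AC;20') == '£20 ≠ €20'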
Example #13
    def _buffer_decode_surrogates(sup, input, errors, final):
        """
        When we have improperly encoded surrogates, we can still see the
        bits that they were meant to represent.

        The surrogates were meant to encode a 20-bit number, to which we
        add 0x10000 to get a codepoint. That 20-bit number now appears in
        this form:

          11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst

        The CESU8_RE above matches byte sequences of this form. Then we need
        to extract the bits and assemble a codepoint number from them.
        """
        if len(input) < 6:
            if final:
                # We found 0xed near the end of the stream, and there aren't
                # six bytes to decode. Delegate to the superclass method to
                # handle it as normal UTF-8. It might be a Hangul character
                # or an error.
                if PYTHON2 and len(input) >= 3:
                    # We can't trust Python 2 to raise an error when it's
                    # asked to decode a surrogate, so let's force the issue.
                    input = mangle_surrogates(input)
                return sup(input, errors, final)
            else:
                # We found 0xed, the stream isn't over yet, and we don't know
                # enough of the following bytes to decode anything, so consume
                # zero bytes and wait.
                return '', 0
        else:
            if CESU8_RE.match(input):
                # If this is a CESU-8 sequence, do some math to pull out
                # the intended 20-bit value, and consume six bytes.
                bytenums = bytes_to_ints(input[:6])
                codepoint = (
                    ((bytenums[1] & 0x0f) << 16) +
                    ((bytenums[2] & 0x3f) << 10) +
                    ((bytenums[4] & 0x0f) << 6) +
                    (bytenums[5] & 0x3f) +
                    0x10000
                )
                return unichr(codepoint), 6
            else:
                # This looked like a CESU-8 sequence, but it wasn't one.
                # 0xed indicates the start of a three-byte sequence, so give
                # three bytes to the superclass to decode as usual -- except
                # for working around the Python 2 discrepancy as before.
                if PYTHON2:
                    input = mangle_surrogates(input)
                return sup(input[:3], errors, False)
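
A quick sanity check of the bit arithmetic, using the CESU-8 bytes for U+1F602 (surrogate pair U+D83D U+DE02, which encodes as ED A0 BD ED B8 82):

seq = [0xed, 0xa0, 0xbd, 0xed, 0xb8, 0x82]
codepoint = (((seq[1] & 0x0f) << 16) +
             ((seq[2] & 0x3f) << 10) +
             ((seq[4] & 0x0f) << 6) +
             (seq[5] & 0x3f) + 0x10000)
assert codepoint == 0x1F602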
Example #14
def _build_width_map():
    """
    Build a translate mapping that replaces halfwidth and fullwidth forms
    with their standard-width forms.
    """
    # Though it's not listed as a fullwidth character, we'll want to convert
    # U+3000 IDEOGRAPHIC SPACE to U+0020 SPACE on the same principle, so start
    # with that in the dictionary.
    width_map = {0x3000: u' '}
    for i in range(0xff01, 0xfff0):
        char = unichr(i)
        alternate = unicodedata.normalize(u'NFKC', char)
        if alternate != char:
            width_map[i] = alternate
    return width_map
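
The resulting dict plugs straight into str.translate:

import unicodedata

unichr = chr  # Python 3 spelling

WIDTH_MAP = _build_width_map()
print('Ｆｕｌｌｗｉｄｔｈ　１２３'.translate(WIDTH_MAP))  # Fullwidth 123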
Example #15
def _build_width_map():
    """
    Build a translate mapping that replaces halfwidth and fullwidth forms
    with their standard-width forms.
    """
    # Though it's not listed as a fullwidth character, we'll want to convert
    # U+3000 IDEOGRAPHIC SPACE to U+0020 SPACE on the same principle, so start
    # with that in the dictionary.
    width_map = {0x3000: ' '}
    for i in range(0xff01, 0xfff0):
        char = unichr(i)
        alternate = unicodedata.normalize('NFKC', char)
        if alternate != char:
            width_map[i] = alternate
    return width_map
Example #16
def _build_regexes():
    """
    ENCODING_REGEXES contains reasonably fast ways to detect whether we
    could represent a given string in a given encoding. The simplest one is
    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.
    """
    # Define a regex that matches ASCII text.
    encoding_regexes = {'ascii': re.compile('^[\x00-\x7f]*$')}

    for encoding in CHARMAP_ENCODINGS:
        latin1table = ''.join(unichr(i) for i in range(128, 256))
        charlist = latin1table.encode('latin-1').decode(encoding)
        regex = '^[\x00-\x7f{}]*$'.format(charlist.replace('\\', '\\\\'))
        encoding_regexes[encoding] = re.compile(regex)
    return encoding_regexes
Example #17
    def _buffer_decode_surrogates(sup, input, errors, final):
        """
        When we have improperly encoded surrogates, we can still see the
        bits that they were meant to represent.

        The surrogates were meant to encode a 20-bit number, to which we
        add 0x10000 to get a codepoint. That 20-bit number now appears in
        this form:

          11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst

        The CESU8_RE above matches byte sequences of this form. Then we need
        to extract the bits and assemble a codepoint number from them.
        """
        if len(input) < 6:
            if final:
                # We found 0xed near the end of the stream, and there aren't
                # six bytes to decode. Delegate to the superclass method
                # to handle this error.
                return sup(input, errors, final)
            else:
                # We found 0xed, the stream isn't over yet, and we don't know
                # enough of the following bytes to decode anything, so consume
                # zero bytes and wait.
                return '', 0
        else:
            if CESU8_RE.match(input):
                # If this is a CESU-8 sequence, do some math to pull out
                # the intended 20-bit value, and consume six bytes.
                bytenums = bytes_to_ints(input[:6])
                codepoint = (
                    ((bytenums[1] & 0x0f) << 16) +
                    ((bytenums[2] & 0x3f) << 10) +
                    ((bytenums[4] & 0x0f) << 6) +
                    (bytenums[5] & 0x3f) +
                    0x10000
                )
                return unichr(codepoint), 6
            else:
                # This looked like a CESU-8 sequence, but it wasn't one.
                # 0xed indicates the start of a three-byte sequence, so give
                # three bytes to the superclass, so it can either decode them
                # as a surrogate codepoint (on Python 2) or handle the error
                # (on Python 3).
                return sup(input[:3], errors, False)
Example #18
    def _buffer_decode_surrogates(sup, input, errors, final):
        """
        When we have improperly encoded surrogates, we can still see the
        bits that they were meant to represent.

        The surrogates were meant to encode a 20-bit number, to which we
        add 0x10000 to get a codepoint. That 20-bit number now appears in
        this form:

          11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst

        The CESU8_RE above matches byte sequences of this form. Then we need
        to extract the bits and assemble a codepoint number from them.
        """
        if len(input) < 6:
            if final:
                # We found 0xed near the end of the stream, and there aren't
                # six bytes to decode. Delegate to the superclass method
                # to handle this error.
                return sup(input, errors, final)
            else:
                # We found 0xed, the stream isn't over yet, and we don't know
                # enough of the following bytes to decode anything, so consume
                # zero bytes and wait.
                return '', 0
        else:
            if CESU8_RE.match(input):
                # If this is a CESU-8 sequence, do some math to pull out
                # the intended 20-bit value, and consume six bytes.
                bytenums = bytes_to_ints(input[:6])
                codepoint = (((bytenums[1] & 0x0f) << 16) +
                             ((bytenums[2] & 0x3f) << 10) +
                             ((bytenums[4] & 0x0f) << 6) +
                             (bytenums[5] & 0x3f) + 0x10000)
                return unichr(codepoint), 6
            else:
                # This looked like a CESU-8 sequence, but it wasn't one.
                # 0xed indicates the start of a three-byte sequence, so give
                # three bytes to the superclass, so it can either decode them
                # as a surrogate codepoint (on Python 2) or handle the error
                # (on Python 3).
                return sup(input[:3], errors, False)
Example #19
def _build_regexes():
    """
    ENCODING_REGEXES contains reasonably fast ways to detect whether we
    could represent a given string in a given encoding. The simplest one is
    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.
    """
    # Define a regex that matches ASCII text.
    encoding_regexes = {'ascii': re.compile('^[\x00-\x7f]*$')}

    for encoding in CHARMAP_ENCODINGS:
        latin1table = ''.join(unichr(i) for i in range(128, 256))
        charlist = latin1table.encode('latin-1').decode(encoding)

        # Build a regex from the ASCII range, followed by the decodings of
        # bytes 0x80-0xff in this character set. (This uses the fact that all
        # regex special characters are ASCII, and therefore won't appear in the
        # string.)
        regex = '^[\x00-\x7f{0}]*$'.format(charlist)
        encoding_regexes[encoding] = re.compile(regex)
    return encoding_regexes
Example #20
def _build_regexes():
    """
    ENCODING_REGEXES contains reasonably fast ways to detect whether we
    could represent a given string in a given encoding. The simplest one is
    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.
    """
    # Define a regex that matches ASCII text.
    encoding_regexes = {'ascii': re.compile('^[\x00-\x7f]*$')}

    for encoding in CHARMAP_ENCODINGS:
        latin1table = ''.join(unichr(i) for i in range(128, 256))
        charlist = latin1table.encode('latin-1').decode(encoding)

        # Build a regex from the ASCII range, followed by the decodings of
        # bytes 0x80-0xff in this character set. (This uses the fact that all
        # regex special characters are ASCII, and therefore won't appear in the
        # string.)
        regex = '^[\x00-\x7f{}]*$'.format(charlist)
        encoding_regexes[encoding] = re.compile(regex)
    return encoding_regexes
Example #21
def _build_utf8_punct_regex():
    """
    Recognize UTF-8 mojibake that's so blatant that we can fix it even when the
    rest of the string doesn't decode as UTF-8 -- namely, UTF-8 sequences for
    the 'General Punctuation' characters U+2000 to U+2040, re-encoded in
    Windows-1252.

    These are recognizable by the distinctive 'â€' ('\xe2\x80') sequence they
    all begin with when decoded as Windows-1252.
    """
    # We're making a regex that has all the literal bytes from 0x80 to 0xbf in
    # a range. "Couldn't this have just said [\x80-\xbf]?", you might ask.
    # However, when we decode the regex as Windows-1252, the resulting
    # characters won't even be remotely contiguous.
    #
    # Unrelatedly, the expression that generates these bytes will be so much
    # prettier when we deprecate Python 2.
    continuation_char_list = ''.join(
        unichr(i) for i in range(0x80, 0xc0)).encode('latin-1')
    obvious_utf8 = ('â€[' +
                    continuation_char_list.decode('sloppy-windows-1252') + ']')
    return re.compile(obvious_utf8)
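
What a hit looks like, assuming the sloppy-windows-1252 codec the function relies on has been registered (importing ftfy.bad_codecs does this):

import re
import ftfy.bad_codecs  # registers the sloppy-* codecs

unichr = chr  # Python 3 spelling

UTF8_PUNCT_RE = _build_utf8_punct_regex()

# U+2019, encoded as UTF-8 (E2 80 99) and misread as Windows-1252,
# surfaces as 'â€™' -- exactly what this regex is built to spot.
print(bool(UTF8_PUNCT_RE.search('donâ€™t')))  # True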
Example #22
def _build_utf8_punct_regex():
    """
    Recognize UTF-8 mojibake that's so blatant that we can fix it even when the
    rest of the string doesn't decode as UTF-8 -- namely, UTF-8 sequences for
    the 'General Punctuation' characters U+2000 to U+2040, re-encoded in
    Windows-1252.

    These are recognizable by the distinctive 'â€' ('\xe2\x80') sequence they
    all begin with when decoded as Windows-1252.
    """
    # We're making a regex that has all the literal bytes from 0x80 to 0xbf in
    # a range. "Couldn't this have just said [\x80-\xbf]?", you might ask.
    # However, when we decode the regex as Windows-1252, the resulting
    # characters won't even be remotely contiguous.
    #
    # Unrelatedly, the expression that generates these bytes will be so much
    # prettier when we deprecate Python 2.
    continuation_char_list = ''.join(
        unichr(i) for i in range(0x80, 0xc0)
    ).encode(u'latin-1')
    obvious_utf8 = (u'â€['
                    + continuation_char_list.decode(u'sloppy-windows-1252')
                    + u']')
    return re.compile(obvious_utf8)
Example #23
def remove_bom(text):
    """
    Remove a left-over byte-order mark.
    """
    return text.lstrip(unichr(0xfeff))