def fixup(match):
    """
    Replace one matched HTML entity with the character it represents,
    if possible.

    `match` is a regex match object whose full match (group 0) is the
    candidate entity, such as "&#233;", "&#xe9;" or "&eacute;".

    Returns the decoded character when the entity can be interpreted, and
    the original matched text otherwise.
    """
    text = match.group(0)
    if text[:2] == "&#":
        # Numeric character reference. The hexadecimal marker may be written
        # as either "&#x" or "&#X".
        try:
            if text[:3].lower() == "&#x":
                codept = int(text[3:-1], 16)
            else:
                codept = int(text[2:-1])
            if 0x80 <= codept < 0xa0:
                # Decode this range of characters as Windows-1252, as Web
                # browsers do in practice.
                return unichr(codept).encode('latin-1').decode('sloppy-windows-1252')
            else:
                return unichr(codept)
        except (ValueError, OverflowError):
            # ValueError: the digits don't parse, or the number is not a
            # valid codepoint. OverflowError: the number is too large for
            # unichr() to even range-check. Either way, leave the text alone.
            pass
    else:
        # Named entity: look up everything after the leading "&".
        try:
            text = entities[text[1:]]
        except KeyError:
            pass
    return text  # leave as is
def _build_regexes():
    """
    Build the ENCODING_REGEXES table: for each encoding name, a compiled
    regex that matches only strings made entirely of characters that
    encoding can produce.

    The 'ascii' entry simply checks that every character lies between
    U+0000 and U+007F. One more entry is added per name in
    CHARMAP_ENCODINGS.
    """
    # Bytes \x80 to \xFF, plus byte \x1A, which the sloppy-* encodings use to
    # represent the replacement character (U+FFFD). The byte sequence is the
    # same for every encoding, so it is built once up front.
    high_bytes = (
        ''.join(unichr(i) for i in range(128, 256)) + '\x1a'
    ).encode('latin-1')

    patterns = {'ascii': re.compile('^[\x00-\x7f]*$')}
    for name in CHARMAP_ENCODINGS:
        decoded_chars = high_bytes.decode(name)
        # The remaining ASCII bytes (\x00 to \x19 and \x1B to \x7F) decode to
        # themselves in every supported encoding, so they are written as
        # ranges. Every regex special character falls inside the \x1B-\x7F
        # range, so nothing in `decoded_chars` needs escaping.
        pattern = '^[\x00-\x19\x1b-\x7f{0}]*$'.format(decoded_chars)
        patterns[name] = re.compile(pattern)
    return patterns
def _build_regexes():
    """
    Build the ENCODING_REGEXES table: for each encoding name, a compiled
    regex that matches only strings made entirely of characters that
    encoding can produce.

    The 'ascii' entry simply checks that every character lies between
    U+0000 and U+007F. One more entry is added per name in
    CHARMAP_ENCODINGS.
    """
    # Bytes \x80 to \xFF, plus byte \x1A, which the sloppy-* encodings use to
    # represent the replacement character (U+FFFD). The byte sequence is the
    # same for every encoding, so it is built once up front.
    table = u''.join(unichr(i) for i in range(128, 256)) + '\x1a'
    table_bytes = table.encode(u'latin-1')

    result = {u'ascii': re.compile('^[\x00-\x7f]*$')}
    for enc_name in CHARMAP_ENCODINGS:
        # The remaining ASCII bytes (\x00 to \x19 and \x1B to \x7F) decode to
        # themselves in every supported encoding, so they are written as
        # ranges; all regex special characters sit inside \x1B-\x7F, so the
        # decoded characters never need escaping.
        result[enc_name] = re.compile(
            u'^[\x00-\x19\x1b-\x7f{0}]*$'.format(table_bytes.decode(enc_name))
        )
    return result
def remove_bom(text):
    r"""
    Strip every leading U+FEFF (byte-order mark) character from `text` and
    return the result.

    >>> print(remove_bom("\ufeffWhere do you want to go today?"))
    Where do you want to go today?
    """
    bom = unichr(0xfeff)
    return text.lstrip(bom)
def remove_bom(text):
    r"""
    Strip every leading U+FEFF (byte-order mark) character from `text` and
    return the result.

    >>> print(remove_bom(unichr(0xfeff) + "Where do you want to go today?"))
    Where do you want to go today?
    """
    byte_order_mark = unichr(0xfeff)
    stripped = text.lstrip(byte_order_mark)
    return stripped
def remove_bom(text):
    r"""
    Drop a byte-order mark that was mistakenly decoded as part of the text.

    All U+FEFF characters at the start of `text` are removed; the rest of
    the string is returned unchanged.

    >>> print(remove_bom("\ufeffWhere do you want to go today?"))
    Where do you want to go today?
    """
    bom_char = unichr(0xfeff)
    return text.lstrip(bom_char)
def _build_charmaps():
    """
    Build the per-encoding character maps and the encoding-detection regexes.

    Returns a tuple `(charmaps, encoding_regexes)`:

    - `charmaps[encoding]` is a dict keyed by codepoint. Codepoints below
      0x80 map to themselves. For each byte value 0x80-0xFF, the codepoint
      that the byte decodes to in `encoding` maps to the Latin-1 character
      with that byte value. Python's decoders are strict: a byte they
      refuse to decode (a ValueError) is treated as decoding to the Latin-1
      character of the same value, so that its information is preserved.
    - `encoding_regexes[encoding]` matches strings consisting only of ASCII
      characters plus the non-ASCII characters that appear as keys of that
      charmap. The 'ascii' entry matches strings whose characters are all
      between U+0000 and U+007F.
    """
    # Codepoints below 0x80 map to themselves in every encoding.
    ascii_identity = {cp: unichr(cp) for cp in range(0, 0x80)}

    charmaps = {}
    encoding_regexes = {'ascii': re.compile('^[\x00-\x7f]*$')}
    for encoding in CHARMAP_ENCODINGS:
        charmap = dict(ascii_identity)
        for byte_value in range(0x80, 0x100):
            latin1_char = unichr(byte_value)
            raw_byte = latin1_char.encode('latin-1')
            try:
                decoded = raw_byte.decode(encoding)
            except ValueError:
                # The codec has no mapping for this byte; keep it as the
                # Latin-1 character of the same value.
                decoded = latin1_char
            charmap[ord(decoded)] = latin1_char

        high_chars = ''.join(
            unichr(key) for key in sorted(charmap) if key >= 0x80
        )
        charmaps[encoding] = charmap
        encoding_regexes[encoding] = re.compile(
            '^[\x00-\x7f{}]*$'.format(high_chars)
        )
    return charmaps, encoding_regexes
def convert_surrogate_pair(match):
    """
    Turn a matched surrogate pair into the single codepoint it stands for.

    `match` is a regex match whose group 0 is a high surrogate followed by a
    low surrogate. The arithmetic follows the formula described at:

        http://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates
    """
    pair = match.group(0)
    # Offsets of each surrogate from the start of its range.
    lead_offset = ord(pair[0]) - 0xd800
    trail_offset = ord(pair[1]) - 0xdc00
    return unichr(0x10000 + lead_offset * 0x400 + trail_offset)
def convert_surrogate_pair(match):
    """
    Turn a matched surrogate pair into the single codepoint it stands for.

    `match` is a regex match whose group 0 is a high surrogate followed by a
    low surrogate. The arithmetic follows the formula described at:

        http://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates
    """
    surrogates = match.group(0)
    high = ord(surrogates[0])
    low = ord(surrogates[1])
    # Each surrogate contributes its offset from the start of its range; the
    # high one is scaled by 0x400.
    combined = (high - 0xd800) * 0x400 + (low - 0xdc00)
    return unichr(0x10000 + combined)
def fix_java_encoding(bytestring):
    """
    Convert a bytestring that might contain "Java UTF8" into valid UTF-8.

    There are two things that Java is known to do with its "UTF8" encoder
    that are incompatible with UTF-8. (If you happen to be writing Java
    code, apparently the standards-compliant encoder is named "AS32UTF8".)

    - Every UTF-16 character is separately encoded as UTF-8. This is wrong
      when the UTF-16 string contains surrogates; the character they actually
      represent should have been encoded as UTF-8 instead. Unicode calls this
      "CESU-8", the Compatibility Encoding Scheme for Unicode. Python 2 will
      decode it as if it's UTF-8, but Python 3 refuses to.

    - The null codepoint, U+0000, is encoded as 0xc0 0x80, which avoids
      outputting a null byte by breaking the UTF shortest-form rule.
      Unicode does not even deign to give this scheme a name, and no version
      of Python will decode it.

    Every CESU-8 sequence found by CESU8_RE is rewritten, wherever it occurs
    in the input. `bytestring` must be a `bytes` object; the return value is
    a new `bytes` object.
    """
    assert isinstance(bytestring, bytes)

    # Replace the sloppy encoding of U+0000 with the correct one.
    bytestring = bytestring.replace(b'\xc0\x80', b'\x00')

    # When we have improperly encoded surrogates, we can still see the
    # bits that they were meant to represent.
    #
    # The surrogates were meant to encode a 20-bit number, to which we
    # add 0x10000 to get a codepoint. That 20-bit number now appears in
    # this form:
    #
    #   11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst
    #
    # CESU8_RE matches byte sequences of this form. Then we need to extract
    # the bits and assemble a codepoint number from them.
    fixed_pieces = []
    copied_upto = 0  # everything before this index is already in fixed_pieces
    match = CESU8_RE.search(bytestring)
    while match:
        pos = match.start()
        cesu8_sequence = bytes_to_ints(bytestring[pos:pos + 6])
        assert cesu8_sequence[0] == cesu8_sequence[3] == 0xed
        codepoint = (
            ((cesu8_sequence[1] & 0x0f) << 16) +
            ((cesu8_sequence[2] & 0x3f) << 10) +
            ((cesu8_sequence[4] & 0x0f) << 6) +
            (cesu8_sequence[5] & 0x3f) +
            0x10000
        )
        # For the reason why this will work on all Python builds, see
        # compatibility.py.
        new_bytes = unichr(codepoint).encode('utf-8')
        fixed_pieces.append(bytestring[copied_upto:pos])
        fixed_pieces.append(new_bytes)
        copied_upto = pos + 6
        # Keep *searching* from just past the sequence we consumed. Using
        # .match() here would only find a sequence that starts exactly at
        # that position and would miss any later one.
        match = CESU8_RE.search(bytestring, copied_upto)

    fixed_pieces.append(bytestring[copied_upto:])
    return b''.join(fixed_pieces)
def _build_charmaps():
    """
    Build the per-encoding character maps and the encoding-detection regexes.

    Returns a tuple `(charmaps, encoding_regexes)`:

    - `charmaps[encoding]` is a dict keyed by codepoint. Codepoints below
      0x80 map to themselves. For each byte value 0x80-0xFF, the codepoint
      that the byte decodes to in `encoding` maps to the Latin-1 character
      with that byte value. Python's decoders are strict: a byte they
      refuse to decode (a ValueError) is treated as decoding to the Latin-1
      character of the same value, so that its information is preserved.
    - `encoding_regexes[encoding]` matches strings consisting only of ASCII
      characters plus the non-ASCII characters that appear as keys of that
      charmap. The 'ascii' entry matches strings whose characters are all
      between U+0000 and U+007F.
    """
    all_charmaps = {}
    all_regexes = {'ascii': re.compile('^[\x00-\x7f]*$')}

    for encoding in CHARMAP_ENCODINGS:
        # ASCII codepoints map to themselves.
        mapping = dict((cp, unichr(cp)) for cp in range(0, 0x80))

        for cp in range(0x80, 0x100):
            as_latin1 = unichr(cp)
            single_byte = as_latin1.encode('latin-1')
            try:
                as_decoded = single_byte.decode(encoding)
            except ValueError:
                # Undecodable byte: fall back to the Latin-1 character.
                as_decoded = as_latin1
            mapping[ord(as_decoded)] = as_latin1

        non_ascii = [unichr(key) for key in sorted(mapping.keys()) if key >= 0x80]
        all_charmaps[encoding] = mapping
        all_regexes[encoding] = re.compile(
            '^[\x00-\x7f{}]*$'.format(''.join(non_ascii))
        )

    return all_charmaps, all_regexes
def fix_java_encoding(bytestring):
    """
    Convert a bytestring that might contain "Java UTF8" into valid UTF-8.

    There are two things that Java is known to do with its "UTF8" encoder
    that are incompatible with UTF-8. (If you happen to be writing Java
    code, apparently the standards-compliant encoder is named "AS32UTF8".)

    - Every UTF-16 character is separately encoded as UTF-8. This is wrong
      when the UTF-16 string contains surrogates; the character they actually
      represent should have been encoded as UTF-8 instead. Unicode calls this
      "CESU-8", the Compatibility Encoding Scheme for Unicode. Python 2 will
      decode it as if it's UTF-8, but Python 3 refuses to.

    - The null codepoint, U+0000, is encoded as 0xc0 0x80, which avoids
      outputting a null byte by breaking the UTF shortest-form rule.
      Unicode does not even deign to give this scheme a name, and no version
      of Python will decode it.

    Every CESU-8 sequence found by CESU8_RE is rewritten, wherever it occurs
    in the input. `bytestring` must be a `bytes` object; the return value is
    a new `bytes` object.
    """
    assert isinstance(bytestring, bytes)

    # Replace the sloppy encoding of U+0000 with the correct one.
    bytestring = bytestring.replace(b'\xc0\x80', b'\x00')

    # When we have improperly encoded surrogates, we can still see the
    # bits that they were meant to represent.
    #
    # The surrogates were meant to encode a 20-bit number, to which we
    # add 0x10000 to get a codepoint. That 20-bit number now appears in
    # this form:
    #
    #   11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst
    #
    # CESU8_RE matches byte sequences of this form. Then we need to extract
    # the bits and assemble a codepoint number from them.
    fixed_pieces = []
    copied_upto = 0  # everything before this index is already in fixed_pieces
    match = CESU8_RE.search(bytestring)
    while match:
        pos = match.start()
        cesu8_sequence = bytes_to_ints(bytestring[pos:pos + 6])
        assert cesu8_sequence[0] == cesu8_sequence[3] == 0xed
        codepoint = (((cesu8_sequence[1] & 0x0f) << 16) +
                     ((cesu8_sequence[2] & 0x3f) << 10) +
                     ((cesu8_sequence[4] & 0x0f) << 6) +
                     (cesu8_sequence[5] & 0x3f) +
                     0x10000)
        # For the reason why this will work on all Python builds, see
        # compatibility.py.
        new_bytes = unichr(codepoint).encode('utf-8')
        fixed_pieces.append(bytestring[copied_upto:pos])
        fixed_pieces.append(new_bytes)
        copied_upto = pos + 6
        # Keep *searching* from just past the sequence we consumed. Using
        # .match() here would only find a sequence that starts exactly at
        # that position and would miss any later one.
        match = CESU8_RE.search(bytestring, copied_upto)

    fixed_pieces.append(bytestring[copied_upto:])
    return b''.join(fixed_pieces)
def fixup(match):
    """
    Replace one matched HTML entity with the character it represents,
    if possible.

    `match` is a regex match object whose full match (group 0) is the
    candidate entity, such as "&#233;", "&#xe9;" or "&eacute;".

    Returns the decoded character when the entity can be interpreted, and
    the original matched text otherwise.
    """
    text = match.group(0)
    if text[:2] == "&#":
        # Numeric character reference. The hexadecimal marker may be written
        # as either "&#x" or "&#X".
        try:
            if text[:3].lower() == "&#x":
                return unichr(int(text[3:-1], 16))
            else:
                return unichr(int(text[2:-1]))
        except (ValueError, OverflowError):
            # ValueError: the digits don't parse, or the number is not a
            # valid codepoint. OverflowError: the number is too large for
            # unichr() to even range-check. Either way, leave the text alone.
            pass
    else:
        # Named entity: look up the name between the "&" and the final
        # character of the match.
        try:
            text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
        except KeyError:
            pass
    return text  # leave as is
def fixup(match):
    """
    Replace one matched HTML entity with the character it represents,
    if possible.

    `match` is a regex match object whose full match (group 0) is the
    candidate entity, such as "&#233;", "&#xe9;" or "&eacute;".

    Returns the decoded character when the entity can be interpreted, and
    the original matched text otherwise.
    """
    text = match.group(0)
    if text[:2] == "&#":
        # Numeric character reference. The hexadecimal marker may be written
        # as either "&#x" or "&#X".
        try:
            if text[:3].lower() == "&#x":
                return unichr(int(text[3:-1], 16))
            else:
                return unichr(int(text[2:-1]))
        except (ValueError, OverflowError):
            # ValueError: the digits don't parse, or the number is not a
            # valid codepoint. OverflowError: the number is too large for
            # unichr() to even range-check. Either way, leave the text alone.
            pass
    else:
        # Named entity: look up the name between the "&" and the final
        # character of the match.
        try:
            text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
        except KeyError:
            pass
    return text  # leave as is
def _buffer_decode_surrogates(sup, input, errors, final):
    """
    Decode the front of `input` when it may start with a CESU-8 surrogate
    pair, returning a `(decoded_text, bytes_consumed)` pair in the same way
    `sup`, the superclass decoding method this delegates to, does.

    When we have improperly encoded surrogates, we can still see the bits
    that they were meant to represent. The surrogates were meant to encode
    a 20-bit number, to which we add 0x10000 to get a codepoint. That
    20-bit number now appears in this form:

      11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst

    CESU8_RE matches byte sequences of this form. Then we need to extract
    the bits and assemble a codepoint number from them.
    """
    if len(input) < 6:
        if not final:
            # We found 0xed, the stream isn't over yet, and we don't know
            # enough of the following bytes to decode anything, so consume
            # zero bytes and wait.
            return '', 0

        # We found 0xed near the end of the stream, and there aren't six
        # bytes to decode. Delegate to the superclass method to handle it as
        # normal UTF-8. It might be a Hangul character or an error.
        if PYTHON2 and len(input) >= 3:
            # We can't trust Python 2 to raise an error when it's asked to
            # decode a surrogate, so let's force the issue.
            input = mangle_surrogates(input)
        return sup(input, errors, final)

    if CESU8_RE.match(input):
        # This is a CESU-8 sequence: pull out the intended 20-bit value and
        # consume six bytes.
        bytenums = bytes_to_ints(input[:6])
        high_bits = ((bytenums[1] & 0x0f) << 16) + ((bytenums[2] & 0x3f) << 10)
        low_bits = ((bytenums[4] & 0x0f) << 6) + (bytenums[5] & 0x3f)
        return unichr(high_bits + low_bits + 0x10000), 6

    # This looked like a CESU-8 sequence, but it wasn't one. 0xed indicates
    # the start of a three-byte sequence, so give three bytes to the
    # superclass to decode as usual -- except for working around the
    # Python 2 discrepancy as before.
    if PYTHON2:
        input = mangle_surrogates(input)
    return sup(input[:3], errors, False)
def _build_width_map(): """ Build a translate mapping that replaces halfwidth and fullwidth forms with their standard-width forms. """ # Though it's not listed as a fullwidth character, we'll want to convert # U+3000 IDEOGRAPHIC SPACE to U+20 SPACE on the same principle, so start # with that in the dictionary. width_map = {0x3000: u' '} for i in range(0xff01, 0xfff0): char = unichr(i) alternate = unicodedata.normalize(u'NFKC', char) if alternate != char: width_map[i] = alternate return width_map
def _build_width_map(): """ Build a translate mapping that replaces halfwidth and fullwidth forms with their standard-width forms. """ # Though it's not listed as a fullwidth character, we'll want to convert # U+3000 IDEOGRAPHIC SPACE to U+20 SPACE on the same principle, so start # with that in the dictionary. width_map = {0x3000: ' '} for i in range(0xff01, 0xfff0): char = unichr(i) alternate = unicodedata.normalize('NFKC', char) if alternate != char: width_map[i] = alternate return width_map
def _build_regexes():
    """
    Build the ENCODING_REGEXES table: for each encoding name, a compiled
    regex that matches only strings made entirely of characters that
    encoding can produce.

    The 'ascii' entry simply checks that every character lies between
    U+0000 and U+007F. One more entry is added per name in
    CHARMAP_ENCODINGS.
    """
    # Bytes 0x80-0xFF; the same for every encoding, so built once up front.
    high_bytes = ''.join(unichr(i) for i in range(128, 256)).encode('latin-1')

    # Define a regex that matches ASCII text.
    patterns = {'ascii': re.compile('^[\x00-\x7f]*$')}
    for name in CHARMAP_ENCODINGS:
        decoded_chars = high_bytes.decode(name)
        # Double any backslash so it stays literal inside the character class.
        escaped = decoded_chars.replace('\\', '\\\\')
        patterns[name] = re.compile('^[\x00-\x7f{}]*$'.format(escaped))
    return patterns
def _build_regexes():
    """
    Build the ENCODING_REGEXES table: for each encoding name, a compiled
    regex that matches only strings made entirely of characters that
    encoding can produce.

    The 'ascii' entry simply checks that every character lies between
    U+0000 and U+007F. One more entry is added per name in
    CHARMAP_ENCODINGS.
    """
    # Define a regex that matches ASCII text.
    result = {'ascii': re.compile('^[\x00-\x7f]*$')}

    # Bytes 0x80-0xFF, expressed as Latin-1 characters and then as bytes.
    table = ''.join(unichr(i) for i in range(128, 256))
    table_bytes = table.encode('latin-1')

    for enc_name in CHARMAP_ENCODINGS:
        # Double any backslash so it stays literal inside the character class.
        members = table_bytes.decode(enc_name).replace('\\', '\\\\')
        regex = '^[\x00-\x7f{}]*$'.format(members)
        result[enc_name] = re.compile(regex)
    return result
def _buffer_decode_surrogates(sup, input, errors, final):
    """
    Decode the front of `input` when it may start with a CESU-8 surrogate
    pair, returning a `(decoded_text, bytes_consumed)` pair in the same way
    `sup`, the superclass decoding method this delegates to, does.

    When we have improperly encoded surrogates, we can still see the bits
    that they were meant to represent. The surrogates were meant to encode
    a 20-bit number, to which we add 0x10000 to get a codepoint. That
    20-bit number now appears in this form:

      11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst

    CESU8_RE matches byte sequences of this form. Then we need to extract
    the bits and assemble a codepoint number from them.
    """
    if len(input) < 6:
        if not final:
            # We found 0xed, the stream isn't over yet, and we don't know
            # enough of the following bytes to decode anything, so consume
            # zero bytes and wait.
            return '', 0
        # We found 0xed near the end of the stream, and there aren't six
        # bytes to decode. Delegate to the superclass method to handle this
        # error.
        return sup(input, errors, final)

    if not CESU8_RE.match(input):
        # This looked like a CESU-8 sequence, but it wasn't one. 0xed
        # indicates the start of a three-byte sequence, so give three bytes
        # to the superclass, so it can either decode them as a surrogate
        # codepoint (on Python 2) or handle the error (on Python 3).
        return sup(input[:3], errors, False)

    # This is a CESU-8 sequence: pull out the intended 20-bit value and
    # consume six bytes.
    bytenums = bytes_to_ints(input[:6])
    high_bits = ((bytenums[1] & 0x0f) << 16) + ((bytenums[2] & 0x3f) << 10)
    low_bits = ((bytenums[4] & 0x0f) << 6) + (bytenums[5] & 0x3f)
    return unichr(high_bits + low_bits + 0x10000), 6
def _buffer_decode_surrogates(sup, input, errors, final):
    """
    Decode the front of `input` when it may start with a CESU-8 surrogate
    pair, returning a `(decoded_text, bytes_consumed)` pair in the same way
    `sup`, the superclass decoding method this delegates to, does.

    When we have improperly encoded surrogates, we can still see the bits
    that they were meant to represent. The surrogates were meant to encode
    a 20-bit number, to which we add 0x10000 to get a codepoint. That
    20-bit number now appears in this form:

      11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst

    CESU8_RE matches byte sequences of this form. Then we need to extract
    the bits and assemble a codepoint number from them.
    """
    too_short = len(input) < 6

    if too_short and final:
        # We found 0xed near the end of the stream, and there aren't six
        # bytes to decode. Delegate to the superclass method to handle this
        # error.
        return sup(input, errors, final)

    if too_short:
        # We found 0xed, the stream isn't over yet, and we don't know enough
        # of the following bytes to decode anything, so consume zero bytes
        # and wait.
        return '', 0

    if CESU8_RE.match(input):
        # This is a CESU-8 sequence: pull out the intended 20-bit value and
        # consume six bytes.
        nums = bytes_to_ints(input[:6])
        codepoint = 0x10000
        codepoint += (nums[1] & 0x0f) << 16
        codepoint += (nums[2] & 0x3f) << 10
        codepoint += (nums[4] & 0x0f) << 6
        codepoint += nums[5] & 0x3f
        return unichr(codepoint), 6

    # This looked like a CESU-8 sequence, but it wasn't one. 0xed indicates
    # the start of a three-byte sequence, so give three bytes to the
    # superclass, so it can either decode them as a surrogate codepoint (on
    # Python 2) or handle the error (on Python 3).
    return sup(input[:3], errors, False)
def _build_regexes():
    """
    Build the ENCODING_REGEXES table: for each encoding name, a compiled
    regex that matches only strings made entirely of characters that
    encoding can produce.

    The 'ascii' entry simply checks that every character lies between
    U+0000 and U+007F. One more entry is added per name in
    CHARMAP_ENCODINGS.
    """
    # Bytes 0x80-0xFF; the same for every encoding, so built once up front.
    high_bytes = ''.join(unichr(i) for i in range(128, 256)).encode('latin-1')

    # Define a regex that matches ASCII text.
    patterns = {'ascii': re.compile('^[\x00-\x7f]*$')}
    for name in CHARMAP_ENCODINGS:
        decoded_chars = high_bytes.decode(name)
        # The character class is the ASCII range followed by the decodings
        # of bytes 0x80-0xff in this character set. (All regex special
        # characters are ASCII, and therefore won't appear in the decoded
        # string, so no escaping is needed.)
        patterns[name] = re.compile('^[\x00-\x7f{0}]*$'.format(decoded_chars))
    return patterns
def _build_regexes():
    """
    Build the ENCODING_REGEXES table: for each encoding name, a compiled
    regex that matches only strings made entirely of characters that
    encoding can produce.

    The 'ascii' entry simply checks that every character lies between
    U+0000 and U+007F. One more entry is added per name in
    CHARMAP_ENCODINGS.
    """
    # Define a regex that matches ASCII text.
    result = {'ascii': re.compile('^[\x00-\x7f]*$')}

    # Bytes 0x80-0xFF, expressed as Latin-1 characters and then as bytes.
    table = ''.join(unichr(i) for i in range(128, 256))
    table_bytes = table.encode('latin-1')

    for enc_name in CHARMAP_ENCODINGS:
        members = table_bytes.decode(enc_name)
        # The character class is the ASCII range followed by the decodings
        # of bytes 0x80-0xff in this character set. (All regex special
        # characters are ASCII, and therefore won't appear in `members`.)
        regex = '^[\x00-\x7f{}]*$'.format(members)
        result[enc_name] = re.compile(regex)
    return result
def _build_utf8_punct_regex():
    """
    Build a regex for UTF-8 mojibake that's so blatant that we can fix it
    even when the rest of the string doesn't decode as UTF-8.

    The targets are UTF-8 sequences for the 'General Punctuation'
    characters U+2000 to U+2040 that were re-encoded in Windows-1252. They
    are recognizable by the distinctive 'â€' prefix (bytes 0xE2 0x80 read as
    Windows-1252) that they all begin with.
    """
    # The character class has to hold every byte from 0x80 to 0xbf as it
    # looks after decoding. "Couldn't this have just said [\x80-\xbf]?", you
    # might ask. However, once those bytes are decoded as Windows-1252, the
    # resulting characters aren't even remotely contiguous, so each one is
    # listed individually.
    #
    # Unrelatedly, the expression that generates these bytes will be so much
    # prettier when we deprecate Python 2.
    continuation_bytes = ''.join(
        unichr(i) for i in range(0x80, 0xc0)
    ).encode('latin-1')
    continuation_chars = continuation_bytes.decode('sloppy-windows-1252')
    return re.compile('â€[' + continuation_chars + ']')
def _build_utf8_punct_regex():
    """
    Build a regex for UTF-8 mojibake that's so blatant that we can fix it
    even when the rest of the string doesn't decode as UTF-8.

    The targets are UTF-8 sequences for the 'General Punctuation'
    characters U+2000 to U+2040 that were re-encoded in Windows-1252. They
    are recognizable by the distinctive 'â€' prefix (bytes 0xE2 0x80 read as
    Windows-1252) that they all begin with.
    """
    # The character class has to hold every byte from 0x80 to 0xbf as it
    # looks after decoding. "Couldn't this have just said [\x80-\xbf]?", you
    # might ask. However, once those bytes are decoded as Windows-1252, the
    # resulting characters aren't even remotely contiguous, so each one is
    # listed individually.
    #
    # Unrelatedly, the expression that generates these bytes will be so much
    # prettier when we deprecate Python 2.
    latin1_chars = ''.join(unichr(i) for i in range(0x80, 0xc0))
    continuation_char_list = latin1_chars.encode(u'latin-1')
    char_class = continuation_char_list.decode(u'sloppy-windows-1252')
    obvious_utf8 = u'â€[' + char_class + u']'
    return re.compile(obvious_utf8)
def remove_bom(text):
    """
    Strip every leading U+FEFF (byte-order mark) character from `text` and
    return the result.
    """
    bom = unichr(0xfeff)
    return text.lstrip(bom)
def remove_bom(text):
    """
    Return `text` without any U+FEFF (byte-order mark) characters at its
    start; the rest of the string is left untouched.
    """
    byte_order_mark = unichr(0xfeff)
    without_bom = text.lstrip(byte_order_mark)
    return without_bom