def fix_partial_utf8_punct_in_1252(text):
    """
    Repair isolated punctuation that was encoded as UTF-8 but then decoded
    as Latin-1 or Windows-1252, even when the same repair cannot be applied
    consistently to the rest of the text.

    This is one of the steps used by `fix_encoding`.

    The input is assumed to have been decoded as Windows-1252. Text that was
    decoded as Latin-1 is expected to go through the
    Latin-1-to-Windows-1252 fixer immediately before this function is
    called.
    """

    def _redecode(found):
        "Re-read one matched run of Windows-1252 characters as UTF-8."
        mojibake = found.group(0)
        raw = mojibake.encode('sloppy-windows-1252')
        return raw.decode('utf-8')

    return PARTIAL_UTF8_PUNCT_RE.sub(_redecode, text)
def fix_partial_utf8_punct_in_1252(text):
    """
    Repair isolated punctuation that was encoded as UTF-8 but then decoded
    as Latin-1 or Windows-1252, even when the same repair cannot be applied
    consistently to the rest of the text.

    The text may mix the two decodings: some characters can come from the
    Latin-1 C1 control set while others are the characters that occupy the
    same positions in Windows-1252. The C1 characters are therefore mapped
    to their Windows-1252 counterparts first, and only then is the
    Windows-1252-to-UTF-8 repair applied.

    This is used as a transcoder within `fix_encoding`.
    """

    def _c1_to_w1252(found):
        "Reinterpret matched Latin-1 characters as Windows-1252."
        raw = found.group(0).encode('latin-1')
        return raw.decode('sloppy-windows-1252')

    def _w1252_to_utf8(found):
        "Re-read one matched run of Windows-1252 characters as UTF-8."
        raw = found.group(0).encode('sloppy-windows-1252')
        return raw.decode('utf-8')

    # Pass 1: get every character into the Windows-1252 repertoire.
    normalized = C1_CONTROL_RE.sub(_c1_to_w1252, text)
    # Pass 2: undo the UTF-8-as-Windows-1252 mix-up where the pattern matches.
    return PARTIAL_UTF8_PUNCT_RE.sub(_w1252_to_utf8, normalized)
def fix_one_step_and_explain(text):
    """
    Undo a single layer of incorrect decoding in `text`.

    Returns a pair `(fixed_text, plan)`. The plan is a list of
    `(operation, name, cost)` tuples describing how to reproduce the fix.
    An empty plan means the text was returned as-is.

    Raises UnicodeError if `text` is bytes instead of a Unicode string.
    """
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)

    # Empty text and pure-ASCII text are returned unchanged.
    if len(text) == 0 or possible_encoding(text, 'ascii'):
        return text, []

    # Single-byte encodings that can represent the text but whose bytes did
    # not decode as UTF-8. They are consulted again further down.
    fallback_encodings = []

    # Suppose the text was meant to be UTF-8 but was decoded with a
    # single-byte encoding instead. When this can be fixed, it is usually
    # the right fix, so it is tried before anything else.
    for encoding in CHARMAP_ENCODINGS:
        if not possible_encoding(text, encoding):
            continue

        raw = text.encode(encoding)
        plan = [('encode', encoding, ENCODING_COSTS.get(encoding, 0))]

        try:
            # Look for sequences that would be UTF-8 except that they have
            # b' ' where b'\xa0' would belong, and put the 0xa0 bytes back.
            if ALTERED_UTF8_RE.search(raw):
                raw = restore_byte_a0(raw)
                a0_cost = raw.count(0xa0) * 2
                plan.append(('transcode', 'restore_byte_a0', a0_cost))

            # Byte 0x1a marks a spot where one of the 'sloppy' codecs found
            # a replacement character.
            if encoding.startswith('sloppy') and 0x1a in raw:
                raw = replace_lossy_sequences(raw)
                plan.append(('transcode', 'replace_lossy_sequences', 0))

            # Bytes 0xed or 0xc0 select the 'utf-8-variants' decoder
            # instead of plain UTF-8.
            if 0xed in raw or 0xc0 in raw:
                codec = 'utf-8-variants'
            else:
                codec = 'utf-8'

            decoded = raw.decode(codec)
        except UnicodeDecodeError:
            # Not UTF-8 (or close enough); remember the encoding for later.
            fallback_encodings.append(encoding)
        else:
            plan.append(('decode', codec, 0))
            return decoded, plan

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if PARTIAL_UTF8_PUNCT_RE.search(text):
        repaired = fix_partial_utf8_punct_in_1252(text)
        return repaired, [('transcode', 'fix_partial_utf8_punct_in_1252', 1)]

    # The next most likely case is Latin-1 that was intended to be read as
    # Windows-1252, because those two encodings are easily confused.
    if 'latin-1' in fallback_encodings:
        if 'windows-1252' in fallback_encodings:
            # The text lies in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return text, []

        # Otherwise the text has characters that are in Latin-1 but not in
        # Windows-1252: C1 control characters. Nobody wants those, so
        # assume they were meant to be Windows-1252. The sloppy codec is
        # deliberately not used here, because bad Windows-1252 characters
        # are a bad sign.
        latin1_bytes = text.encode('latin-1')
        try:
            redecoded = latin1_bytes.decode('windows-1252')
        except UnicodeDecodeError:
            # Some characters make no sense even as Windows-1252. In that
            # case, assume nothing and fall through.
            pass
        else:
            if redecoded == text:
                return redecoded, []
            return redecoded, [('encode', 'latin-1', 0),
                               ('decode', 'windows-1252', 1)]

    # What remains are mix-ups between two different single-byte encodings
    # other than the common Latin-1 vs. Windows-1252 case. These may be
    # unsolvable without adding false positives, so the text is returned
    # unchanged with an empty plan.
    return text, []
def fix_one_step_and_explain(text):
    """
    Undo a single layer of incorrect decoding in `text`.

    Returns a pair `(fixed_text, plan)`. The plan is a list of
    `(operation, name, cost)` tuples describing how to reproduce the fix.
    An empty plan means the text was returned as-is.

    Raises UnicodeError if `text` is bytes instead of a Unicode string.
    """
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)

    # Empty text and pure-ASCII text are returned unchanged.
    if len(text) == 0 or possible_encoding(text, 'ascii'):
        return text, []

    # Single-byte encodings that can represent the text but whose bytes did
    # not decode as UTF-8. They are consulted again further down.
    fallback_encodings = []

    # Suppose the text was meant to be UTF-8 but was decoded with a
    # single-byte encoding instead. When this can be fixed, it is usually
    # the right fix, so it is tried before anything else.
    for encoding in CHARMAP_ENCODINGS:
        if not possible_encoding(text, encoding):
            continue

        raw = text.encode(encoding)
        plan = [('encode', encoding, ENCODING_COSTS.get(encoding, 0))]

        try:
            # Look for sequences that would be UTF-8 except that they have
            # b' ' where b'\xa0' would belong, and put the 0xa0 bytes back.
            if ALTERED_UTF8_RE.search(raw):
                raw = restore_byte_a0(raw)
                a0_cost = raw.count(0xa0)
                plan.append(('transcode', 'restore_byte_a0', a0_cost))

            # Byte 0x1a marks a spot where one of the 'sloppy' codecs found
            # a replacement character.
            if encoding.startswith('sloppy') and 0x1a in raw:
                raw = replace_lossy_sequences(raw)
                plan.append(('transcode', 'replace_lossy_sequences', 0))

            # Bytes 0xed or 0xc0 select the 'utf-8-variants' decoder
            # instead of plain UTF-8.
            if 0xed in raw or 0xc0 in raw:
                codec = 'utf-8-variants'
            else:
                codec = 'utf-8'

            decoded = raw.decode(codec)
        except UnicodeDecodeError:
            # Not UTF-8 (or close enough); remember the encoding for later.
            fallback_encodings.append(encoding)
        else:
            plan.append(('decode', codec, 0))
            return decoded, plan

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if PARTIAL_UTF8_PUNCT_RE.search(text):
        repaired = fix_partial_utf8_punct_in_1252(text)
        return repaired, [('transcode', 'fix_partial_utf8_punct_in_1252', 1)]

    # The next most likely case is Latin-1 that was intended to be read as
    # Windows-1252, because those two encodings are easily confused.
    if 'latin-1' in fallback_encodings:
        if 'windows-1252' in fallback_encodings:
            # The text lies in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return text, []

        # Otherwise the text has characters that are in Latin-1 but not in
        # Windows-1252: C1 control characters. Nobody wants those, so
        # assume they were meant to be Windows-1252. The sloppy codec is
        # deliberately not used here, because bad Windows-1252 characters
        # are a bad sign.
        latin1_bytes = text.encode('latin-1')
        try:
            redecoded = latin1_bytes.decode('windows-1252')
        except UnicodeDecodeError:
            # Some characters make no sense even as Windows-1252. In that
            # case, assume nothing and fall through.
            pass
        else:
            if redecoded == text:
                return redecoded, []
            return redecoded, [('encode', 'latin-1', 0),
                               ('decode', 'windows-1252', 1)]

    # What remains are mix-ups between two different single-byte encodings
    # other than the common Latin-1 vs. Windows-1252 case. These may be
    # unsolvable without adding false positives, so the text is returned
    # unchanged with an empty plan.
    return text, []