def check_phrase(text):
    """Assert that `text` survives `fix_encoding` unchanged, and that its
    utf-8-as-latin-1 mojibake form is decoded back to `text`."""
    mojibake = text.encode('utf-8').decode('latin-1')
    eq_(fix_encoding(text), text)
    eq_(fix_encoding(mojibake), text)
    # Drop the first character so that the opening punctuation alone is not
    # what makes the phrase recognizable.
    trimmed = text[1:]
    eq_(fix_encoding(trimmed), trimmed)
    eq_(fix_encoding(trimmed.encode('utf-8').decode('latin-1')), trimmed)
def check_phrase(text):
    """Check one valid phrase: `fix_encoding` must leave it unchanged and
    must recover it from its utf-8-as-latin-1 mangled form."""
    # Checking text[1:] as well makes sure the opening punctuation is not the
    # only thing that makes the phrase work.
    for candidate in (text, text[1:]):
        eq_(fix_encoding(candidate), candidate)
        eq_(fix_encoding(candidate.encode('utf-8').decode('latin-1')),
            candidate)
def check_phrase(text):
    """Check each valid phrase above, making sure that it doesn't get
    changed."""
    def assert_roundtrip(phrase):
        # The phrase must pass through untouched, and its mojibake form must
        # be decoded back to the phrase.
        eq_(fix_encoding(phrase), phrase)
        eq_(fix_encoding(phrase.encode('utf-8').decode('latin-1')), phrase)

    assert_roundtrip(text)
    # Make sure the opening punctuation is not the only thing that makes
    # the phrase recognizable.
    assert_roundtrip(text[1:])
def check_ftfy(self, text, encoding_only=True):
    """
    Given a single text input, check whether `ftfy.fix_text_encoding`
    would change it. If so, display the change.
    """
    self.count += 1
    candidate = unescape_html(text)
    # ASCII-only text cannot contain mojibake, so skip the fixing step.
    if not possible_encoding(candidate, 'ascii'):
        if encoding_only:
            repaired = fix_encoding(candidate)
        else:
            repaired = fix_text(candidate, uncurl_quotes=False,
                                fix_character_width=False)
        if candidate != repaired:
            # possibly filter common bots before printing
            print(u'\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                text=candidate, fixed=repaired))
            self.num_fixed += 1
    # Print status updates once in a while
    if self.count % 100 == 0:
        print('.', end='', flush=True)
    if self.count % 10000 == 0:
        print('\n%d/%d fixed' % (self.num_fixed, self.count))
def check_ftfy(self, text, encoding_only=True):
    """
    Given a single text input, check whether `ftfy.fix_text_encoding`
    would change it. If so, display the change.
    """
    self.count += 1
    candidate = unescape_html(text)
    # ASCII-only text cannot contain mojibake; don't bother fixing it.
    if not possible_encoding(candidate, 'ascii'):
        if encoding_only:
            repaired = fix_encoding(candidate)
        else:
            repaired = fix_text(candidate, uncurl_quotes=False,
                                fix_character_width=False)
        if candidate != repaired:
            # possibly filter common bots before printing
            print('\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                text=candidate, fixed=repaired
            ))
            self.num_fixed += 1
        elif 'â€' in candidate or '\x80' in candidate:
            # Looks like it still contains mojibake, but nothing changed.
            print('\nNot fixed:\t{text!r}'.format(text=candidate))
    # Print status updates once in a while
    if self.count % 100 == 0:
        print('.', end='', flush=True)
    if self.count % 10000 == 0:
        print('\n%d/%d fixed' % (self.num_fixed, self.count))
def test_unknown_emoji():
    # The range we accept as emoji has gotten larger: U+1F960 will probably
    # be a picture of a fortune cookie in Unicode 10.0, and we should be able
    # to decode its mojibake already.
    future_text = "\U0001f960 I see emoji in your future"
    mangled = future_text.encode('utf-8').decode('windows-1252')
    assert fix_encoding(mangled) == future_text

    # We believe enough in the future of this codepoint to recognize it even
    # with its \xa0 byte mangled into a plain space.
    assert fix_encoding(mangled.replace('\xa0', ' ')) == future_text

    # Incrementing the first byte gives a very similar case, but a codepoint
    # that will definitely not exist anytime soon; there the existing text,
    # "ñŸ¥\xa0", is considered more probable and is left alone.
    stays_put = "\U0005f960 I see mojibake in your present".encode('utf-8').decode('windows-1252')
    assert fix_encoding(stays_put) == stays_put
def test_unknown_emoji():
    # A string with two burritos in it. Python doesn't know about Unicode
    # burritos, but ftfy can guess they're probably emoji anyway.
    burritos = 'dos burritos: \U0001f32f\U0001f32f'
    # Mangle the burritos into a mess of Russian characters. (It would have
    # been great to decode them in cp437 instead, turning them into "DOS
    # burritos", but that result is a string ftfy could already fix.)
    scrambled = burritos.encode('utf-8').decode('windows-1251')
    # The original text is restored.
    eq_(fix_encoding(scrambled), burritos)
    # Replacing the burritos with arbitrary unassigned characters doesn't get
    # the same treatment: the mangled text passes through as is.
    unassigned = 'dos burritos: \U0003f32f\U0003f32f'.encode('utf-8').decode('windows-1251')
    eq_(fix_encoding(unassigned), unassigned)
def test_unknown_emoji():
    # The emoji range we accept has grown. U+1F960 will probably be a picture
    # of a fortune cookie in Unicode 10.0; make sure its mojibake decodes.
    cookie = "\U0001f960 I see emoji in your future"
    cookie_bake = cookie.encode('utf-8').decode('windows-1252')
    eq_(fix_encoding(cookie_bake), cookie)

    # We believe enough in this codepoint's future to recognize it even with
    # a mangled byte A0.
    eq_(fix_encoding(cookie_bake.replace('\xa0', ' ')), cookie)

    # Increment the first byte for a very similar test case, but a codepoint
    # that will definitely not exist anytime soon. Here the existing text,
    # "ñŸ¥\xa0", is considered more probable and stays untouched.
    untouched = "\U0005f960 I see mojibake in your present".encode('utf-8').decode('windows-1252')
    eq_(fix_encoding(untouched), untouched)
def fix_text_segment(
    text, *, fix_entities='auto', remove_terminal_escapes=True,
    fix_encoding=True, fix_latin_ligatures=True, fix_character_width=True,
    uncurl_quotes=True, fix_line_breaks=True, fix_surrogates=True,
    remove_control_chars=True, remove_bom=True, normalization='NFC'
):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount of
    text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    # Text containing both angle brackets probably contains real HTML tags,
    # so don't try to unescape entities in it.
    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False

    # The ordered pipeline of (enabled, fixer) steps. `remove_bom` is skipped
    # when `remove_control_chars` runs, because that step is already
    # redundant with it.
    pipeline = [
        (remove_terminal_escapes, fixes.remove_terminal_escapes),
        (fix_encoding, fixes.fix_encoding),
        (fix_entities, fixes.unescape_html),
        (fix_latin_ligatures, fixes.fix_latin_ligatures),
        (fix_character_width, fixes.fix_character_width),
        (uncurl_quotes, fixes.uncurl_quotes),
        (fix_line_breaks, fixes.fix_line_breaks),
        (fix_surrogates, fixes.fix_surrogates),
        (remove_control_chars, fixes.remove_control_chars),
        (remove_bom and not remove_control_chars, fixes.remove_bom),
    ]

    # Keep applying the enabled fixes until the text reaches a fixed point.
    while True:
        previous = text
        for enabled, step in pipeline:
            if enabled:
                text = step(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == previous:
            return text
def fix_text_segment(text, fix_entities='auto', remove_terminal_escapes=True,
                     fix_encoding=True, fix_latin_ligatures=True,
                     fix_character_width=True, uncurl_quotes=True,
                     fix_line_breaks=True, fix_surrogates=True,
                     remove_control_chars=True, remove_bom=True,
                     normalization='NFC'):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount of
    text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    # Text with both angle brackets probably contains actual HTML tags, so
    # don't unescape entities in it.
    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False

    # Run the enabled fixers, in order, until the text stops changing.
    while True:
        previous = text
        text = fixes.remove_terminal_escapes(text) if remove_terminal_escapes else text
        text = fixes.fix_encoding(text) if fix_encoding else text
        text = fixes.unescape_html(text) if fix_entities else text
        text = fixes.fix_latin_ligatures(text) if fix_latin_ligatures else text
        text = fixes.fix_character_width(text) if fix_character_width else text
        text = fixes.uncurl_quotes(text) if uncurl_quotes else text
        text = fixes.fix_line_breaks(text) if fix_line_breaks else text
        text = fixes.fix_surrogates(text) if fix_surrogates else text
        text = fixes.remove_control_chars(text) if remove_control_chars else text
        if remove_bom and not remove_control_chars:
            # `remove_control_chars` already strips the BOM, so this step is
            # only needed when that one was skipped.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == previous:
            return text
def test_unknown_emoji():
    # Two Unicode burritos. Python doesn't know about these codepoints, but
    # ftfy can guess they're probably emoji anyway.
    burritos = 'dos burritos: \U0001f32f\U0001f32f'
    # Mangled through windows-1251 into a mess of Russian characters. (cp437
    # would have given us "DOS burritos", but ftfy could already fix that
    # string.)
    eq_(fix_encoding(burritos.encode('utf-8').decode('windows-1251')),
        burritos)

    # Arbitrary unassigned characters are not restored: the mangled text
    # passes through as is.
    unassigned = 'dos burritos: \U0003f32f\U0003f32f'.encode('utf-8').decode(
        'windows-1251')
    eq_(fix_encoding(unassigned), unassigned)

    # The emoji range we accept has grown. U+1F960 will probably be a
    # fortune cookie in Unicode 10.0, and its mojibake should decode already.
    cookie = "\U0001f960 I see emoji in your future"
    cookie_bake = cookie.encode('utf-8').decode('windows-1252')
    eq_(fix_encoding(cookie_bake), cookie)

    # We believe in this codepoint enough to recognize it even with its \xa0
    # byte mangled into a space.
    eq_(fix_encoding(cookie_bake.replace('\xa0', ' ')), cookie)

    # Incrementing the first byte gives a similar case with a codepoint that
    # won't exist anytime soon; the existing text, "ñŸ¥\xa0", is judged more
    # probable and left alone.
    untouched = "\U0005f960 I see mojibake in your present".encode(
        'utf-8').decode('windows-1252')
    eq_(fix_encoding(untouched), untouched)
def test_unknown_emoji():
    # A string holding two burrito emoji. Python doesn't know about Unicode
    # burritos, but ftfy can guess they're probably emoji anyway.
    burrito_text = 'dos burritos: \U0001f32f\U0001f32f'
    # Mangle the burritos into Russian characters. (Decoding them in cp437
    # would have made "DOS burritos", but ftfy could already fix that.)
    burrito_bake = burrito_text.encode('utf-8').decode('windows-1251')
    # The original text comes back.
    eq_(fix_encoding(burrito_bake), burrito_text)

    # With arbitrary unassigned characters instead, the mangled text simply
    # passes through.
    passthrough = 'dos burritos: \U0003f32f\U0003f32f'.encode('utf-8').decode('windows-1251')
    eq_(fix_encoding(passthrough), passthrough)

    # The emoji range we accept has gotten larger: U+1F960 will probably be
    # a fortune cookie in Unicode 10.0, and we can decode it now.
    cookie_text = "\U0001f960 I see emoji in your future"
    cookie_bake = cookie_text.encode('utf-8').decode('windows-1252')
    eq_(fix_encoding(cookie_bake), cookie_text)

    # Still recognized with a mangled byte A0.
    cookie_bake = cookie_bake.replace('\xa0', ' ')
    eq_(fix_encoding(cookie_bake), cookie_text)

    # A codepoint that will definitely not exist anytime soon: the existing
    # text, "ñŸ¥\xa0", is considered more probable, so nothing changes.
    unknown = "\U0005f960 I see mojibake in your present".encode('utf-8').decode('windows-1252')
    eq_(fix_encoding(unknown), unknown)
def test_unicode_11():
    # Unicode 11 implemented the mtavruli form of the Georgian script,
    # analogous to capital letters: it can emphasize text or write a
    # headline. Python's .upper() converts Georgian text to this form
    # starting in 3.7.0, so every version must treat the result as
    # reasonable text.
    #
    # This is the mtavruli form of "ქართული ენა", meaning "Georgian
    # language".
    mtavruli = 'ᲥᲐᲠᲗᲣᲚᲘ ᲔᲜᲐ'
    assert sequence_weirdness(mtavruli) == 0
    garbled = mtavruli.encode('utf-8').decode('sloppy-windows-1252')
    assert fix_encoding(garbled) == mtavruli
def test_unicode_11():
    # Unicode 11 added the mtavruli form of the Georgian script. Like
    # capital letters, it is used to emphasize text or write a headline.
    # Python's .upper() produces it for Georgian text starting in 3.7.0,
    # and we want the result recognized as reasonable text on all versions.
    #
    # The mtavruli form of "ქართული ენა" ("Georgian language"):
    headline = 'ᲥᲐᲠᲗᲣᲚᲘ ᲔᲜᲐ'
    assert sequence_weirdness(headline) == 0
    mangled = headline.encode('utf-8').decode('sloppy-windows-1252')
    assert fix_encoding(mangled) == headline
def test_byte_order_mark():
    # 'ï»¿' is the UTF-8 encoding of the byte-order mark (bytes EF BB BF)
    # read back as Windows-1252/Latin-1; fix_encoding should restore the
    # real U+FEFF. The input literal had degraded to an empty string (the
    # mojibake characters were lost in transit), which could never produce
    # '\ufeff'.
    eq_(fix_encoding('ï»¿'), '\ufeff')
def test_byte_order_mark():
    # 'ï»¿' is the UTF-8 encoding of the byte-order mark (bytes EF BB BF)
    # read back as Windows-1252/Latin-1; fix_encoding should restore the
    # real U+FEFF. The input literal had degraded to an empty string (the
    # mojibake characters were lost in transit), which could never produce
    # '\ufeff'.
    assert fix_encoding('ï»¿') == '\ufeff'
# Demo / doctest-style walkthrough of ftfy's public API.
# NOTE(review): this fragment looks truncated — `display_center` is imported
# but never called, and `lines` is defined without being used; confirm the
# rest of the example exists elsewhere.

from ftfy import fix_text, explain_unicode

# Top-level fixing: mojibake repair, HTML entities, shrug-emoticon rescue.
print(fix_text('ünicode'))
print(fix_text('<3'))
print(fix_text("¯\\_(ã\x83\x84)_/¯"))
len(fix_text(''))
# Per-character explanation of a string's codepoints.
explain_unicode('ノ( º _ ºノ) 테스트')

# Individual fixers from ftfy.fixes.
from ftfy.fixes import fix_encoding, unescape_html, uncurl_quotes, fix_line_breaks, decode_escapes
print(fix_encoding('â\x81”.'))
print(unescape_html('<hr>'))
print(uncurl_quotes('\u201ctest\u201d'))
print(fix_line_breaks("1. hello\u2028" "2. world"))
# Decode backslash escapes that were left as literal text.
factoid = '\\u20a2'
print(decode_escapes(factoid))

# Formatting helpers: display width accounts for fullwidth characters.
from ftfy.formatting import character_width, display_center
print(character_width('A'))
print(character_width('가'))
lines = ['Display center', 'center']
def test_lossy_utf8():
    # The curly quotes were mangled into their UTF-8-as-Windows-1252 forms
    # ('â€œ' / 'â€'), but the closing quote's final byte \x9d was lossily
    # decoded to U+FFFD. fix_encoding should repair what it can and keep the
    # replacement character. (The input's opening 'â€œ' had itself been
    # accidentally de-mojibaked to '“' in transit, which defeated the test's
    # purpose; restored here.)
    eq_(fix_encoding('â€œlossy decodingâ€�'), '“lossy decoding�')
def test_lossy_utf8():
    # The curly quotes were mangled into their UTF-8-as-Windows-1252 forms
    # ('â€œ' / 'â€'), but the closing quote's final byte \x9d was lossily
    # decoded to U+FFFD. fix_encoding should repair what it can and keep the
    # replacement character. (The input's opening 'â€œ' had itself been
    # accidentally de-mojibaked to '“' in transit, which defeated the test's
    # purpose; restored here.)
    eq_(fix_encoding('â€œlossy decodingâ€�'), '“lossy decoding�')
def test_fix_with_backslash():
    # The input mixes literal backslashes (written as \\) with the raw
    # UTF-8 bytes E2 89 A5 of '≥' stored as Latin-1 characters.
    # fix_encoding must repair the mojibake while leaving the backslashes
    # and '%' signs untouched.
    eq_(fix_encoding("<40\\% vs \xe2\x89\xa540\\%"), "<40\\% vs ≥40\\%")
def test_mixed_utf8():
    # A mixed-encoding mess: the opening quote is raw UTF-8 bytes
    # (\xe2\x80\x9c), while '\x85' and '\x94' are the Windows-1252 forms of
    # '…' and '”'. fix_encoding should untangle both in one pass.
    eq_(fix_encoding('\xe2\x80\x9cmismatched quotes\x85\x94'),
        '“mismatched quotes…”')
    # NOTE(review): input and expected output are identical here, so this
    # reads as an idempotence check (already-fixed text is untouched) — but
    # the input may also be mojibake that was accidentally repaired in
    # transit. Confirm against the upstream test.
    eq_(fix_encoding('“mismatched quotes…”'), '“mismatched quotes…”')
def test_byte_order_mark():
    # 'ï»¿' is the UTF-8 encoding of the byte-order mark (bytes EF BB BF)
    # read back as Windows-1252/Latin-1; fix_encoding should restore the
    # real U+FEFF. The input literal had degraded to an empty string (the
    # mojibake characters were lost in transit), which could never produce
    # '\ufeff'.
    eq_(fix_encoding('ï»¿'), '\ufeff')
def test_byte_order_mark():
    # 'ï»¿' is the UTF-8 encoding of the byte-order mark (bytes EF BB BF)
    # read back as Windows-1252/Latin-1; fix_encoding should restore the
    # real U+FEFF. The input literal had degraded to an empty string (the
    # mojibake characters were lost in transit), which could never produce
    # '\ufeff'.
    assert fix_encoding('ï»¿') == '\ufeff'