Exemplo n.º 1
0
def check_phrase(text):
    # A valid phrase must survive fix_encoding unchanged, and must be
    # recovered after a UTF-8 -> Latin-1 mojibake round trip.
    mangled = text.encode('utf-8').decode('latin-1')
    eq_(fix_encoding(text), text)
    eq_(fix_encoding(mangled), text)
    # The opening punctuation must not be the only cue that makes the
    # repair work, so repeat the check with it stripped.
    trimmed = text[1:]
    eq_(fix_encoding(trimmed), trimmed)
    eq_(fix_encoding(trimmed.encode('utf-8').decode('latin-1')), trimmed)
Exemplo n.º 2
0
def check_phrase(text):
    # Each valid phrase must pass through fix_encoding untouched, both
    # intact and after a UTF-8/Latin-1 mojibake round trip.  The second
    # pass drops the opening punctuation, so the repair cannot depend on
    # it alone.
    for sample in (text, text[1:]):
        eq_(fix_encoding(sample), sample)
        eq_(fix_encoding(sample.encode('utf-8').decode('latin-1')), sample)
Exemplo n.º 3
0
def check_phrase(text):
    # Valid phrases must survive fix_encoding both as-is and after being
    # mangled through a UTF-8 -> Latin-1 misdecoding.
    def mangle(s):
        return s.encode('utf-8').decode('latin-1')

    eq_(fix_encoding(text), text)
    eq_(fix_encoding(mangle(text)), text)

    # Stripping the opening punctuation must not change the verdict.
    rest = text[1:]
    eq_(fix_encoding(rest), rest)
    eq_(fix_encoding(mangle(rest)), rest)
Exemplo n.º 4
0
    def check_ftfy(self, text, encoding_only=True):
        """
        Run one line of text through ftfy's fixer, printing the
        before/after pair whenever the fixer would change it.
        """
        self.count += 1
        text = unescape_html(text)
        if not possible_encoding(text, 'ascii'):
            # Only non-ASCII text is interesting; pick the fixer requested.
            if encoding_only:
                repaired = fix_encoding(text)
            else:
                repaired = fix_text(text, uncurl_quotes=False,
                                    fix_character_width=False)
            if repaired != text:
                # possibly filter common bots before printing
                print(u'\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                    text=text, fixed=repaired))
                self.num_fixed += 1

        # Periodic progress output.
        if self.count % 100 == 0:
            print('.', end='', flush=True)
        if self.count % 10000 == 0:
            print('\n%d/%d fixed' % (self.num_fixed, self.count))
Exemplo n.º 5
0
    def check_ftfy(self, text, encoding_only=True):
        """
        Run one line of text through ftfy's fixer and report whether it
        would change; also flag mojibake-looking text it left alone.
        """
        self.count += 1
        text = unescape_html(text)
        if not possible_encoding(text, 'ascii'):
            # Only non-ASCII text is interesting; pick the fixer requested.
            if encoding_only:
                repaired = fix_encoding(text)
            else:
                repaired = fix_text(text,
                                    uncurl_quotes=False,
                                    fix_character_width=False)
            if repaired != text:
                # possibly filter common bots before printing
                print('\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                    text=text, fixed=repaired
                ))
                self.num_fixed += 1
            elif 'â€' in text or '\x80' in text:
                # Looks like mojibake the fixer declined to touch.
                print('\nNot fixed:\t{text!r}'.format(text=text))

        # Periodic progress output.
        if self.count % 100 == 0:
            print('.', end='', flush=True)
        if self.count % 10000 == 0:
            print('\n%d/%d fixed' % (self.num_fixed, self.count))
def test_unknown_emoji():
    # The accepted emoji range has grown: U+1F960 (expected to be a
    # fortune cookie in Unicode 10.0) should be recoverable even if this
    # Python doesn't know the codepoint.
    emoji_text = "\U0001f960 I see emoji in your future"
    mangled = emoji_text.encode('utf-8').decode('windows-1252')
    assert fix_encoding(mangled) == emoji_text

    # It should still be recognized with its A0 byte mangled into an
    # ordinary space.
    assert fix_encoding(mangled.replace('\xa0', ' ')) == emoji_text

    # Increment the first byte: a codepoint that won't be assigned any
    # time soon. The existing text "ñŸ¥\xa0" is judged more probable, so
    # nothing is changed.
    not_emoji = "\U0005f960 I see mojibake in your present".encode('utf-8').decode('windows-1252')
    assert fix_encoding(not_emoji) == not_emoji
Exemplo n.º 7
0
def test_unknown_emoji():
    # Two burrito emoji (U+1F32F). Python may not know the codepoints,
    # but ftfy should still guess they're emoji.
    burritos = 'dos burritos: \U0001f32f\U0001f32f'

    # Mangle them through windows-1251 into Russian-looking noise.
    # (Decoding in cp437 would have made "DOS burritos", but ftfy could
    # already fix that string.)
    noise = burritos.encode('utf-8').decode('windows-1251')

    # The mojibake should be reversed exactly.
    eq_(fix_encoding(noise), burritos)

    # Arbitrary unassigned characters get no such benefit of the doubt:
    # their mangled text passes through untouched.
    not_emoji = 'dos burritos: \U0003f32f\U0003f32f'.encode('utf-8').decode('windows-1251')
    eq_(fix_encoding(not_emoji), not_emoji)
Exemplo n.º 8
0
def test_unknown_emoji():
    # The accepted emoji range has grown: U+1F960 (expected to be a
    # fortune cookie in Unicode 10.0) should be decodable.
    future = "\U0001f960 I see emoji in your future"
    future_bake = future.encode('utf-8').decode('windows-1252')
    eq_(fix_encoding(future_bake), future)

    # It should still be recognized after its A0 byte is mangled into an
    # ordinary space.
    eq_(fix_encoding(future_bake.replace('\xa0', ' ')), future)

    # Increment the first byte: a very similar case, but a codepoint that
    # definitely won't exist soon. The existing text "ñŸ¥\xa0" is judged
    # more probable, so nothing changes.
    stays = "\U0005f960 I see mojibake in your present".encode('utf-8').decode('windows-1252')
    eq_(fix_encoding(stays), stays)
Exemplo n.º 9
0
def fix_text_segment(
    text,
    *,
    fix_entities='auto',
    remove_terminal_escapes=True,
    fix_encoding=True,
    fix_latin_ligatures=True,
    fix_character_width=True,
    uncurl_quotes=True,
    fix_line_breaks=True,
    fix_surrogates=True,
    remove_control_chars=True,
    remove_bom=True,
    normalization='NFC'
):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False

    # Ordered table of (enabled, fixer) pairs. The order is significant
    # and is kept exactly as in the original if-chain.
    pipeline = [
        (remove_terminal_escapes, fixes.remove_terminal_escapes),
        (fix_encoding, fixes.fix_encoding),
        (fix_entities, fixes.unescape_html),
        (fix_latin_ligatures, fixes.fix_latin_ligatures),
        (fix_character_width, fixes.fix_character_width),
        (uncurl_quotes, fixes.uncurl_quotes),
        (fix_line_breaks, fixes.fix_line_breaks),
        (fix_surrogates, fixes.fix_surrogates),
        (remove_control_chars, fixes.remove_control_chars),
        # remove_bom would be redundant after remove_control_chars.
        (remove_bom and not remove_control_chars, fixes.remove_bom),
    ]

    # Repeat the whole pipeline until it reaches a fixed point.
    while True:
        previous = text
        for enabled, fixer in pipeline:
            if enabled:
                text = fixer(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == previous:
            return text
Exemplo n.º 10
0
def fix_text_segment(text,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     fix_latin_ligatures=True,
                     fix_character_width=True,
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True,
                     normalization='NFC'):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False

    # The enabled fixers, in the same significant order as the original
    # if-chain.
    steps = [
        (remove_terminal_escapes, fixes.remove_terminal_escapes),
        (fix_encoding, fixes.fix_encoding),
        (fix_entities, fixes.unescape_html),
        (fix_latin_ligatures, fixes.fix_latin_ligatures),
        (fix_character_width, fixes.fix_character_width),
        (uncurl_quotes, fixes.uncurl_quotes),
        (fix_line_breaks, fixes.fix_line_breaks),
        (fix_surrogates, fixes.fix_surrogates),
        (remove_control_chars, fixes.remove_control_chars),
        # Skipped when remove_control_chars already ran; it would be
        # redundant.
        (remove_bom and not remove_control_chars, fixes.remove_bom),
    ]

    # Apply the pipeline repeatedly until a full pass changes nothing.
    while True:
        before = text
        for wanted, step in steps:
            if wanted:
                text = step(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == before:
            return text
Exemplo n.º 11
0
def test_unknown_emoji():
    def mojibake(s, codec):
        # Encode as UTF-8, then misread the bytes in a single-byte codec.
        return s.encode('utf-8').decode(codec)

    # Two burrito emoji (U+1F32F): Python may not know the codepoints,
    # but ftfy should guess they're emoji and undo the windows-1251
    # mangling. (cp437 would have given "DOS burritos", but ftfy could
    # already fix that string.)
    burritos = 'dos burritos: \U0001f32f\U0001f32f'
    eq_(fix_encoding(mojibake(burritos, 'windows-1251')), burritos)

    # Arbitrary unassigned characters get no such benefit of the doubt;
    # their mangled text passes through as is.
    bad_burritos = mojibake('dos burritos: \U0003f32f\U0003f32f',
                            'windows-1251')
    eq_(fix_encoding(bad_burritos), bad_burritos)

    # The accepted emoji range has grown: U+1F960, expected to be a
    # fortune cookie in Unicode 10.0, should decode as well.
    cookie = "\U0001f960 I see emoji in your future"
    cookie_bake = mojibake(cookie, 'windows-1252')
    eq_(fix_encoding(cookie_bake), cookie)

    # Even with its A0 byte mangled into an ordinary space.
    eq_(fix_encoding(cookie_bake.replace('\xa0', ' ')), cookie)

    # Increment the first byte: a codepoint that definitely won't exist
    # soon. The existing text "ñŸ¥\xa0" is judged more probable, so the
    # string is left alone.
    not_future = mojibake("\U0005f960 I see mojibake in your present",
                          'windows-1252')
    eq_(fix_encoding(not_future), not_future)
Exemplo n.º 12
0
def test_unknown_emoji():
    # Burrito emoji (U+1F32F): even if Python's tables don't know them,
    # ftfy should treat them as probable emoji.
    good_text = 'dos burritos: \U0001f32f\U0001f32f'

    # Run them through a windows-1251 misdecoding into Russian-looking
    # noise. (Decoding in cp437 would have produced "DOS burritos", but
    # ftfy could already fix that string.)
    bake = good_text.encode('utf-8').decode('windows-1251')
    eq_(fix_encoding(bake), good_text)

    # Unassigned non-emoji codepoints are not repaired; the mangled text
    # is passed through as is.
    untouched = 'dos burritos: \U0003f32f\U0003f32f'.encode('utf-8').decode('windows-1251')
    eq_(fix_encoding(untouched), untouched)

    # The accepted emoji range has grown; U+1F960, the expected fortune
    # cookie of Unicode 10.0, should be recoverable too.
    future = "\U0001f960 I see emoji in your future"
    future_bake = future.encode('utf-8').decode('windows-1252')
    eq_(fix_encoding(future_bake), future)

    # Even after its A0 byte is mangled into an ordinary space.
    future_bake = future_bake.replace('\xa0', ' ')
    eq_(fix_encoding(future_bake), future)

    # A similar case with an implausible codepoint: the existing text
    # "ñŸ¥\xa0" is judged more probable, so nothing is changed.
    stays = "\U0005f960 I see mojibake in your present".encode('utf-8').decode('windows-1252')
    eq_(fix_encoding(stays), stays)
def test_unicode_11():
    # Unicode 11 added the mtavruli form of the Georgian script —
    # analogous to capital letters, used for emphasis or headlines.
    # Python 3.7+ produces it from .upper() on Georgian text, and ftfy
    # must treat it as reasonable text on every version.
    #
    # This is the mtavruli form of "ქართული ენა" ("Georgian language").
    mtavruli = 'ᲥᲐᲠᲗᲣᲚᲘ ᲔᲜᲐ'
    assert sequence_weirdness(mtavruli) == 0

    # It must also survive a sloppy-windows-1252 mojibake round trip.
    garbled = mtavruli.encode('utf-8').decode('sloppy-windows-1252')
    assert fix_encoding(garbled) == mtavruli
Exemplo n.º 14
0
def test_unicode_11():
    # The mtavruli (emphasis/headline) form of Georgian was added in
    # Unicode 11; Python 3.7+ generates it via .upper() on Georgian
    # text. ftfy should accept it as ordinary text on all versions.
    #
    # Mtavruli form of "ქართული ენა", meaning "Georgian language":
    sample = 'ᲥᲐᲠᲗᲣᲚᲘ ᲔᲜᲐ'

    # Not weird on its own...
    assert sequence_weirdness(sample) == 0

    # ...and recoverable from a sloppy-windows-1252 misdecoding.
    baked = sample.encode('utf-8').decode('sloppy-windows-1252')
    assert fix_encoding(baked) == sample
Exemplo n.º 15
0
def test_byte_order_mark():
    # 'ï»¿' is the UTF-8 encoding of U+FEFF (the byte order mark)
    # misread as Latin-1/Windows-1252; fix_encoding should recover the
    # BOM itself. (The argument had been stripped to '' in this copy,
    # which could never equal '\ufeff' — fix_encoding('') is ''.)
    eq_(fix_encoding('ï»¿'), '\ufeff')
Exemplo n.º 16
0
def test_byte_order_mark():
    # 'ï»¿' is U+FEFF's UTF-8 bytes misread as Latin-1/Windows-1252;
    # fix_encoding should turn them back into the byte order mark. (The
    # argument had been stripped to '' in this copy, making the
    # assertion unsatisfiable: fix_encoding('') is ''.)
    assert fix_encoding('ï»¿') == '\ufeff'
Exemplo n.º 17
0
# Demo of ftfy's public API: each print shows one fixer in action.
from ftfy import fix_text, explain_unicode

# fix_text repairs mojibake: 'ünicode' is 'ünicode' after a UTF-8 ->
# Latin-1 misdecoding, and comes back correct.
print(fix_text('ünicode'))

# HTML entities are unescaped by default ('&lt;3' -> '<3').
print(fix_text('&lt;3'))

# A shrug emoticon whose middle got mangled ('ã\x83\x84' is the UTF-8 of
# 'ツ' read byte-by-byte); entities and encoding are fixed in one call.
print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))

# NOTE(review): the argument below appears to have lost its characters
# in transcription — fix_text('') is simply '' (length 0). Verify
# against the original example.
len(fix_text(''))

# explain_unicode prints a per-character breakdown of the string.
explain_unicode('ノ( º _ ºノ) 테스트')

# The lower-level fixers can also be used individually.
from ftfy.fixes import fix_encoding, unescape_html, uncurl_quotes, fix_line_breaks, decode_escapes

# fix_encoding repairs only the character-encoding layer.
print(fix_encoding('â\x81”.'))

# unescape_html turns entities back into markup characters ('<hr>').
print(unescape_html('&lt;hr&gt;'))

# uncurl_quotes straightens curly quotation marks.
print(uncurl_quotes('\u201ctest\u201d'))

# fix_line_breaks normalizes unusual separators such as U+2028.
print(fix_line_breaks("1. hello\u2028" "2. world"))

# decode_escapes interprets literal backslash escapes in the text.
factoid = '\\u20a2'
print(decode_escapes(factoid))

# Formatting helpers: character_width reports terminal column width
# (East Asian characters occupy two columns).
from ftfy.formatting import character_width, display_center

print(character_width('A'))
print(character_width('가'))

# NOTE(review): this example looks truncated here — display_center is
# imported but the code that uses `lines` is not part of this excerpt.
lines = ['Display center', 'center']
Exemplo n.º 18
0
def test_lossy_utf8():
    # The trailing 'â€' + U+FFFD is mojibake whose final byte was lost
    # to lossy decoding; ftfy repairs what it can and keeps the '�'.
    lossy = '“lossy decodingâ€�'
    eq_(fix_encoding(lossy), '“lossy decoding�')
Exemplo n.º 19
0
def test_lossy_utf8():
    # A replacement character (U+FFFD) marks a byte lost in a lossy
    # decode; the fixable part is repaired and the '�' is preserved.
    damaged = '“lossy decodingâ€�'
    expected = '“lossy decoding�'
    eq_(fix_encoding(damaged), expected)
Exemplo n.º 20
0
def test_fix_with_backslash():
    # Backslashes are ordinary characters here; only the UTF-8-as-bytes
    # sequence '\xe2\x89\xa5' (which encodes '≥') should be repaired.
    broken = "<40\\% vs \xe2\x89\xa540\\%"
    eq_(fix_encoding(broken), "<40\\% vs ≥40\\%")
Exemplo n.º 21
0
def test_mixed_utf8():
    expected = '“mismatched quotes…”'
    # '\xe2\x80\x9c' is a UTF-8 left quote read byte-by-byte, while
    # '\x85' and '\x94' are Windows-1252 ellipsis and right quote — a
    # mixed mess that should still be repaired.
    eq_(fix_encoding('\xe2\x80\x9cmismatched quotes\x85\x94'), expected)
    # Text that is already correct must pass through unchanged.
    eq_(fix_encoding(expected), expected)
Exemplo n.º 22
0
def test_byte_order_mark():
    # 'ï»¿' is the UTF-8 encoding of U+FEFF misread as
    # Latin-1/Windows-1252; fix_encoding should recover the byte order
    # mark. (The literal had been stripped to '' in this copy, which
    # made the test trivially false: fix_encoding('') is ''.)
    eq_(fix_encoding('ï»¿'), '\ufeff')
Exemplo n.º 23
0
def test_byte_order_mark():
    # 'ï»¿' is U+FEFF's UTF-8 bytes misread as Latin-1/Windows-1252, and
    # should be restored to the BOM. (The literal had been stripped to
    # '' in this copy; fix_encoding('') is '', so the assertion could
    # never hold.)
    assert fix_encoding('ï»¿') == '\ufeff'