def check_phrase(text):
    """Assert that `text` survives `fix_encoding` unchanged, and that its
    utf-8-as-latin-1 mojibake form is decoded back to `text`."""
    mojibake = text.encode('utf-8').decode('latin-1')
    eq_(fix_encoding(text), text)
    eq_(fix_encoding(mojibake), text)
    # Drop the first character so that the opening punctuation alone is not
    # what makes the phrase recognizable.
    trimmed = text[1:]
    eq_(fix_encoding(trimmed), trimmed)
    eq_(fix_encoding(trimmed.encode('utf-8').decode('latin-1')), trimmed)
def check_phrase(text):
    """Check one valid phrase: `fix_encoding` must leave it unchanged and
    must recover it from its utf-8-as-latin-1 mangled form."""
    # Checking text[1:] as well makes sure the opening punctuation is not the
    # only thing that makes the phrase work.
    for candidate in (text, text[1:]):
        eq_(fix_encoding(candidate), candidate)
        eq_(fix_encoding(candidate.encode('utf-8').decode('latin-1')),
            candidate)
def check_phrase(text):
    """Check each valid phrase above, making sure that it doesn't get
    changed."""
    def assert_roundtrip(phrase):
        # The phrase must pass through untouched, and its mojibake form must
        # be decoded back to the phrase.
        eq_(fix_encoding(phrase), phrase)
        eq_(fix_encoding(phrase.encode('utf-8').decode('latin-1')), phrase)

    assert_roundtrip(text)
    # Make sure the opening punctuation is not the only thing that makes
    # the phrase recognizable.
    assert_roundtrip(text[1:])
def check_ftfy(self, text, encoding_only=True):
    """
    Given a single text input, check whether `ftfy.fix_text_encoding`
    would change it. If so, display the change.
    """
    self.count += 1
    candidate = unescape_html(text)
    # ASCII-only text cannot contain mojibake, so skip the fixing step.
    if not possible_encoding(candidate, 'ascii'):
        if encoding_only:
            repaired = fix_encoding(candidate)
        else:
            repaired = fix_text(candidate, uncurl_quotes=False,
                                fix_character_width=False)
        if candidate != repaired:
            # possibly filter common bots before printing
            print(u'\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                text=candidate, fixed=repaired))
            self.num_fixed += 1
    # Print status updates once in a while
    if self.count % 100 == 0:
        print('.', end='', flush=True)
    if self.count % 10000 == 0:
        print('\n%d/%d fixed' % (self.num_fixed, self.count))
def check_ftfy(self, text, encoding_only=True):
    """
    Given a single text input, check whether `ftfy.fix_text_encoding`
    would change it. If so, display the change.
    """
    self.count += 1
    candidate = unescape_html(text)
    # ASCII-only text cannot contain mojibake; don't bother fixing it.
    if not possible_encoding(candidate, 'ascii'):
        if encoding_only:
            repaired = fix_encoding(candidate)
        else:
            repaired = fix_text(candidate, uncurl_quotes=False,
                                fix_character_width=False)
        if candidate != repaired:
            # possibly filter common bots before printing
            print('\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                text=candidate, fixed=repaired
            ))
            self.num_fixed += 1
        elif 'â€' in candidate or '\x80' in candidate:
            # Looks like it still contains mojibake, but nothing changed.
            print('\nNot fixed:\t{text!r}'.format(text=candidate))
    # Print status updates once in a while
    if self.count % 100 == 0:
        print('.', end='', flush=True)
    if self.count % 10000 == 0:
        print('\n%d/%d fixed' % (self.num_fixed, self.count))
def test_unknown_emoji():
    # The range we accept as emoji has gotten larger: U+1F960 will probably
    # be a picture of a fortune cookie in Unicode 10.0, and we should be able
    # to decode its mojibake already.
    future_text = "\U0001f960 I see emoji in your future"
    mangled = future_text.encode('utf-8').decode('windows-1252')
    assert fix_encoding(mangled) == future_text

    # We believe enough in the future of this codepoint to recognize it even
    # with its \xa0 byte mangled into a plain space.
    assert fix_encoding(mangled.replace('\xa0', ' ')) == future_text

    # Incrementing the first byte gives a very similar case, but a codepoint
    # that will definitely not exist anytime soon; there the existing text,
    # "ñŸ¥\xa0", is considered more probable and is left alone.
    stays_put = "\U0005f960 I see mojibake in your present".encode('utf-8').decode('windows-1252')
    assert fix_encoding(stays_put) == stays_put
def test_unknown_emoji():
    # A string with two burritos in it. Python doesn't know about Unicode
    # burritos, but ftfy can guess they're probably emoji anyway.
    burritos = 'dos burritos: \U0001f32f\U0001f32f'
    # Mangle the burritos into a mess of Russian characters. (It would have
    # been great to decode them in cp437 instead, turning them into "DOS
    # burritos", but that result is a string ftfy could already fix.)
    scrambled = burritos.encode('utf-8').decode('windows-1251')
    # The original text is restored.
    eq_(fix_encoding(scrambled), burritos)
    # Replacing the burritos with arbitrary unassigned characters doesn't get
    # the same treatment: the mangled text passes through as is.
    unassigned = 'dos burritos: \U0003f32f\U0003f32f'.encode('utf-8').decode('windows-1251')
    eq_(fix_encoding(unassigned), unassigned)
def test_unknown_emoji():
    # The emoji range we accept has grown. U+1F960 will probably be a picture
    # of a fortune cookie in Unicode 10.0; make sure its mojibake decodes.
    cookie = "\U0001f960 I see emoji in your future"
    cookie_bake = cookie.encode('utf-8').decode('windows-1252')
    eq_(fix_encoding(cookie_bake), cookie)

    # We believe enough in this codepoint's future to recognize it even with
    # a mangled byte A0.
    eq_(fix_encoding(cookie_bake.replace('\xa0', ' ')), cookie)

    # Increment the first byte for a very similar test case, but a codepoint
    # that will definitely not exist anytime soon. Here the existing text,
    # "ñŸ¥\xa0", is considered more probable and stays untouched.
    untouched = "\U0005f960 I see mojibake in your present".encode('utf-8').decode('windows-1252')
    eq_(fix_encoding(untouched), untouched)
def fix_text_segment(
    text, *, fix_entities='auto', remove_terminal_escapes=True,
    fix_encoding=True, fix_latin_ligatures=True, fix_character_width=True,
    uncurl_quotes=True, fix_line_breaks=True, fix_surrogates=True,
    remove_control_chars=True, remove_bom=True, normalization='NFC'
):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount of
    text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    # Text containing both angle brackets probably contains real HTML tags,
    # so don't try to unescape entities in it.
    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False

    # The ordered pipeline of (enabled, fixer) steps. `remove_bom` is skipped
    # when `remove_control_chars` runs, because that step is already
    # redundant with it.
    pipeline = [
        (remove_terminal_escapes, fixes.remove_terminal_escapes),
        (fix_encoding, fixes.fix_encoding),
        (fix_entities, fixes.unescape_html),
        (fix_latin_ligatures, fixes.fix_latin_ligatures),
        (fix_character_width, fixes.fix_character_width),
        (uncurl_quotes, fixes.uncurl_quotes),
        (fix_line_breaks, fixes.fix_line_breaks),
        (fix_surrogates, fixes.fix_surrogates),
        (remove_control_chars, fixes.remove_control_chars),
        (remove_bom and not remove_control_chars, fixes.remove_bom),
    ]

    # Keep applying the enabled fixes until the text reaches a fixed point.
    while True:
        previous = text
        for enabled, step in pipeline:
            if enabled:
                text = step(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == previous:
            return text
def fix_text_segment(text, fix_entities='auto', remove_terminal_escapes=True,
                     fix_encoding=True, fix_latin_ligatures=True,
                     fix_character_width=True, uncurl_quotes=True,
                     fix_line_breaks=True, fix_surrogates=True,
                     remove_control_chars=True, remove_bom=True,
                     normalization='NFC'):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount of
    text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    # Text with both angle brackets probably contains actual HTML tags, so
    # don't unescape entities in it.
    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False

    # Run the enabled fixers, in order, until the text stops changing.
    while True:
        previous = text
        text = fixes.remove_terminal_escapes(text) if remove_terminal_escapes else text
        text = fixes.fix_encoding(text) if fix_encoding else text
        text = fixes.unescape_html(text) if fix_entities else text
        text = fixes.fix_latin_ligatures(text) if fix_latin_ligatures else text
        text = fixes.fix_character_width(text) if fix_character_width else text
        text = fixes.uncurl_quotes(text) if uncurl_quotes else text
        text = fixes.fix_line_breaks(text) if fix_line_breaks else text
        text = fixes.fix_surrogates(text) if fix_surrogates else text
        text = fixes.remove_control_chars(text) if remove_control_chars else text
        if remove_bom and not remove_control_chars:
            # `remove_control_chars` already strips the BOM, so this step is
            # only needed when that one was skipped.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == previous:
            return text
def test_unknown_emoji():
    # Two Unicode burritos. Python doesn't know about these codepoints, but
    # ftfy can guess they're probably emoji anyway.
    burritos = 'dos burritos: \U0001f32f\U0001f32f'
    # Mangled through windows-1251 into a mess of Russian characters. (cp437
    # would have given us "DOS burritos", but ftfy could already fix that
    # string.)
    eq_(fix_encoding(burritos.encode('utf-8').decode('windows-1251')),
        burritos)

    # Arbitrary unassigned characters are not restored: the mangled text
    # passes through as is.
    unassigned = 'dos burritos: \U0003f32f\U0003f32f'.encode('utf-8').decode(
        'windows-1251')
    eq_(fix_encoding(unassigned), unassigned)

    # The emoji range we accept has grown. U+1F960 will probably be a
    # fortune cookie in Unicode 10.0, and its mojibake should decode already.
    cookie = "\U0001f960 I see emoji in your future"
    cookie_bake = cookie.encode('utf-8').decode('windows-1252')
    eq_(fix_encoding(cookie_bake), cookie)

    # We believe in this codepoint enough to recognize it even with its \xa0
    # byte mangled into a space.
    eq_(fix_encoding(cookie_bake.replace('\xa0', ' ')), cookie)

    # Incrementing the first byte gives a similar case with a codepoint that
    # won't exist anytime soon; the existing text, "ñŸ¥\xa0", is judged more
    # probable and left alone.
    untouched = "\U0005f960 I see mojibake in your present".encode(
        'utf-8').decode('windows-1252')
    eq_(fix_encoding(untouched), untouched)
def test_unknown_emoji():
    # A string holding two burrito emoji. Python doesn't know about Unicode
    # burritos, but ftfy can guess they're probably emoji anyway.
    burrito_text = 'dos burritos: \U0001f32f\U0001f32f'
    # Mangle the burritos into Russian characters. (Decoding them in cp437
    # would have made "DOS burritos", but ftfy could already fix that.)
    burrito_bake = burrito_text.encode('utf-8').decode('windows-1251')
    # The original text comes back.
    eq_(fix_encoding(burrito_bake), burrito_text)

    # With arbitrary unassigned characters instead, the mangled text simply
    # passes through.
    passthrough = 'dos burritos: \U0003f32f\U0003f32f'.encode('utf-8').decode('windows-1251')
    eq_(fix_encoding(passthrough), passthrough)

    # The emoji range we accept has gotten larger: U+1F960 will probably be
    # a fortune cookie in Unicode 10.0, and we can decode it now.
    cookie_text = "\U0001f960 I see emoji in your future"
    cookie_bake = cookie_text.encode('utf-8').decode('windows-1252')
    eq_(fix_encoding(cookie_bake), cookie_text)

    # Still recognized with a mangled byte A0.
    cookie_bake = cookie_bake.replace('\xa0', ' ')
    eq_(fix_encoding(cookie_bake), cookie_text)

    # A codepoint that will definitely not exist anytime soon: the existing
    # text, "ñŸ¥\xa0", is considered more probable, so nothing changes.
    unknown = "\U0005f960 I see mojibake in your present".encode('utf-8').decode('windows-1252')
    eq_(fix_encoding(unknown), unknown)
def test_unicode_11():
    # Unicode 11 implemented the mtavruli form of the Georgian script,
    # analogous to capital letters: it can emphasize text or write a
    # headline. Python's .upper() converts Georgian text to this form
    # starting in 3.7.0, so every version must treat the result as
    # reasonable text.
    #
    # This is the mtavruli form of "ქართული ენა", meaning "Georgian
    # language".
    mtavruli = 'ᲥᲐᲠᲗᲣᲚᲘ ᲔᲜᲐ'
    assert sequence_weirdness(mtavruli) == 0
    garbled = mtavruli.encode('utf-8').decode('sloppy-windows-1252')
    assert fix_encoding(garbled) == mtavruli
def test_unicode_11():
    # Unicode 11 added the mtavruli form of the Georgian script. Like
    # capital letters, it is used to emphasize text or write a headline.
    # Python's .upper() produces it for Georgian text starting in 3.7.0,
    # and we want the result recognized as reasonable text on all versions.
    #
    # The mtavruli form of "ქართული ენა" ("Georgian language"):
    headline = 'ᲥᲐᲠᲗᲣᲚᲘ ᲔᲜᲐ'
    assert sequence_weirdness(headline) == 0
    mangled = headline.encode('utf-8').decode('sloppy-windows-1252')
    assert fix_encoding(mangled) == headline
def test_byte_order_mark():
    # 'ï»¿' is the UTF-8 encoding of the byte-order mark (bytes EF BB BF)
    # read back as Windows-1252/Latin-1; fix_encoding should restore the
    # real U+FEFF. The input literal had degraded to an empty string (the
    # mojibake characters were lost in transit), which could never produce
    # '\ufeff'.
    eq_(fix_encoding('ï»¿'), '\ufeff')
def test_byte_order_mark():
    # 'ï»¿' is the UTF-8 encoding of the byte-order mark (bytes EF BB BF)
    # read back as Windows-1252/Latin-1; fix_encoding should restore the
    # real U+FEFF. The input literal had degraded to an empty string (the
    # mojibake characters were lost in transit), which could never produce
    # '\ufeff'.
    assert fix_encoding('ï»¿') == '\ufeff'
# Demo / doctest-style walkthrough of ftfy's public API.
# NOTE(review): this fragment looks truncated — `display_center` is imported
# but never called, and `lines` is defined without being used; confirm the
# rest of the example exists elsewhere.

from ftfy import fix_text, explain_unicode

# Top-level fixing: mojibake repair, HTML entities, shrug-emoticon rescue.
print(fix_text('ünicode'))
print(fix_text('<3'))
print(fix_text("¯\\_(ã\x83\x84)_/¯"))
len(fix_text(''))
# Per-character explanation of a string's codepoints.
explain_unicode('ノ( º _ ºノ) 테스트')

# Individual fixers from ftfy.fixes.
from ftfy.fixes import fix_encoding, unescape_html, uncurl_quotes, fix_line_breaks, decode_escapes
print(fix_encoding('â\x81”.'))
print(unescape_html('<hr>'))
print(uncurl_quotes('\u201ctest\u201d'))
print(fix_line_breaks("1. hello\u2028" "2. world"))
# Decode backslash escapes that were left as literal text.
factoid = '\\u20a2'
print(decode_escapes(factoid))

# Formatting helpers: display width accounts for fullwidth characters.
from ftfy.formatting import character_width, display_center
print(character_width('A'))
print(character_width('가'))
lines = ['Display center', 'center']
def test_lossy_utf8():
    # The curly quotes were mangled into their UTF-8-as-Windows-1252 forms
    # ('â€œ' / 'â€'), but the closing quote's final byte \x9d was lossily
    # decoded to U+FFFD. fix_encoding should repair what it can and keep the
    # replacement character. (The input's opening 'â€œ' had itself been
    # accidentally de-mojibaked to '“' in transit, which defeated the test's
    # purpose; restored here.)
    eq_(fix_encoding('â€œlossy decodingâ€�'), '“lossy decoding�')
def test_lossy_utf8():
    # The curly quotes were mangled into their UTF-8-as-Windows-1252 forms
    # ('â€œ' / 'â€'), but the closing quote's final byte \x9d was lossily
    # decoded to U+FFFD. fix_encoding should repair what it can and keep the
    # replacement character. (The input's opening 'â€œ' had itself been
    # accidentally de-mojibaked to '“' in transit, which defeated the test's
    # purpose; restored here.)
    eq_(fix_encoding('â€œlossy decodingâ€�'), '“lossy decoding�')
def test_fix_with_backslash():
    # The input mixes literal backslashes (written as \\) with the raw
    # UTF-8 bytes E2 89 A5 of '≥' stored as Latin-1 characters.
    # fix_encoding must repair the mojibake while leaving the backslashes
    # and '%' signs untouched.
    eq_(fix_encoding("<40\\% vs \xe2\x89\xa540\\%"), "<40\\% vs ≥40\\%")
def test_mixed_utf8():
    # A mixed-encoding mess: the opening quote is raw UTF-8 bytes
    # (\xe2\x80\x9c), while '\x85' and '\x94' are the Windows-1252 forms of
    # '…' and '”'. fix_encoding should untangle both in one pass.
    eq_(fix_encoding('\xe2\x80\x9cmismatched quotes\x85\x94'),
        '“mismatched quotes…”')
    # NOTE(review): input and expected output are identical here, so this
    # reads as an idempotence check (already-fixed text is untouched) — but
    # the input may also be mojibake that was accidentally repaired in
    # transit. Confirm against the upstream test.
    eq_(fix_encoding('“mismatched quotes…”'), '“mismatched quotes…”')
def test_byte_order_mark():
    # 'ï»¿' is the UTF-8 encoding of the byte-order mark (bytes EF BB BF)
    # read back as Windows-1252/Latin-1; fix_encoding should restore the
    # real U+FEFF. The input literal had degraded to an empty string (the
    # mojibake characters were lost in transit), which could never produce
    # '\ufeff'.
    eq_(fix_encoding('ï»¿'), '\ufeff')
def test_byte_order_mark():
    # 'ï»¿' is the UTF-8 encoding of the byte-order mark (bytes EF BB BF)
    # read back as Windows-1252/Latin-1; fix_encoding should restore the
    # real U+FEFF. The input literal had degraded to an empty string (the
    # mojibake characters were lost in transit), which could never produce
    # '\ufeff'.
    assert fix_encoding('ï»¿') == '\ufeff'