예제 #1
0
def fix_encoding_and_explain(text):
    """
    Repeatedly apply single re-decoding steps to `text`, keeping the
    cheapest result seen, and report the steps that produced it.

    Every pass calls ``fix_one_step_and_explain`` and appends the steps it
    reports to a running plan. The candidate text is scored with
    ``text_cost`` plus fixed penalties that depend on which encodings appear
    anywhere in the running plan. The loop stops as soon as a pass leaves
    the text unchanged.

    Returns a ``(best_text, best_plan)`` tuple. If no candidate scores
    strictly lower than the input, the input is returned with an empty plan.

    To fix similar text in the same way, without having to detect anything,
    you can use the ``apply_plan`` function.
    """
    winner = text
    lowest_cost = text_cost(text)
    winning_steps = []
    steps_taken = []
    current = text
    while True:
        previous = current
        current, new_steps = fix_one_step_and_explain(previous)
        steps_taken.extend(new_steps)
        score = text_cost(current)

        # Particularly obsolete encodings carry a penalty, so they are only
        # chosen when they manage to repair more than a single character.
        used_obsolete = (
            ('encode', 'macroman') in steps_taken
            or ('encode', 'cp437') in steps_taken
        )
        if used_obsolete:
            score += 2

        # Decoding from Windows-1251 (Cyrillic) needs pretty solid evidence,
        # hence the larger penalty.
        if ('encode', 'sloppy-windows-1251') in steps_taken:
            score += 5

        # Only a strictly better score replaces the current best; the plan
        # is copied because steps_taken keeps growing on later passes.
        if score < lowest_cost:
            winner, lowest_cost = current, score
            winning_steps = list(steps_taken)

        # A pass that changes nothing means no further step applies.
        if current == previous:
            break
    return winner, winning_steps
예제 #2
0
def fix_encoding_and_explain(text):
    """
    Undo incorrect decoding of `text` one step at a time and return the
    best version found together with the "plan" of steps that led to it.

    Each iteration asks ``fix_one_step_and_explain`` for the next candidate,
    accumulates the steps it reports, and scores the candidate with
    ``text_cost`` plus penalties for certain encodings present in the
    accumulated plan. Iteration ends when a step returns the text unchanged.

    Returns a ``(best_text, best_plan)`` tuple; when nothing beats the
    original text's cost, that is the original text and an empty list.

    To fix similar text in the same way, without having to detect anything,
    you can use the ``apply_plan`` function.
    """
    # (encodings, penalty) pairs, applied in this order. The first group is
    # the particularly obsolete encodings: the penalty means they are only
    # used when they can successfully replace multiple characters. The second
    # is Windows-1251 (Cyrillic), which needs pretty solid evidence.
    penalty_table = (
        (('macroman', 'cp437'), 2),
        (('sloppy-windows-1251',), 5),
    )

    champion = text
    champion_cost = text_cost(text)
    champion_plan = []
    accumulated = []
    candidate = text
    while True:
        before = candidate
        candidate, step_plan = fix_one_step_and_explain(before)
        accumulated.extend(step_plan)

        candidate_cost = text_cost(candidate)
        for encodings, penalty in penalty_table:
            # Each group's penalty applies at most once, no matter how many
            # of its encodings appear in the accumulated plan.
            if any(('encode', name) in accumulated for name in encodings):
                candidate_cost += penalty

        if candidate_cost < champion_cost:
            champion_cost = candidate_cost
            champion = candidate
            # Snapshot the plan: `accumulated` is mutated on later passes.
            champion_plan = list(accumulated)

        if candidate == before:
            # Fixed point reached; no more steps will change the text.
            return champion, champion_plan
예제 #3
0
def fix_encoding_and_explain(text):
    """
    Repeatedly apply single re-decoding steps to `text`, keeping the
    cheapest result seen, and report the steps that produced it.

    Every pass calls ``fix_one_step_and_explain`` and appends the steps it
    reports to a running plan. Each plan entry is unpacked as a 3-tuple
    whose last element is added to the candidate's ``text_cost`` score, so
    the whole plan so far is charged against every candidate. The loop
    stops as soon as a pass leaves the text unchanged.

    Returns a ``(best_text, best_plan)`` tuple. If no candidate scores
    strictly lower than the input, the input is returned with an empty plan.

    The resulting plan could be used with :func:`ftfy.fixes.apply_plan`
    to fix additional strings that are broken in the same way.
    """
    winner = text
    lowest_cost = text_cost(text)
    winning_steps = []
    steps_taken = []
    current = text
    while True:
        previous = current
        current, new_steps = fix_one_step_and_explain(previous)
        steps_taken.extend(new_steps)

        # Start from the text's own cost, then charge every step taken so
        # far (not only the steps from this pass).
        score = text_cost(current)
        for _first, _second, step_cost in steps_taken:
            score += step_cost

        # Only a strictly better score replaces the current best; the plan
        # is copied because steps_taken keeps growing on later passes.
        if score < lowest_cost:
            winner, lowest_cost = current, score
            winning_steps = list(steps_taken)

        # A pass that changes nothing means no further step applies.
        if current == previous:
            break
    return winner, winning_steps
예제 #4
0
def fix_encoding_and_explain(text):
    """
    Undo incorrect decoding of `text` one step at a time and return the
    best version found together with the "plan" of steps that led to it.

    Each iteration asks ``fix_one_step_and_explain`` for the next candidate
    and accumulates the steps it reports. A candidate's score is its
    ``text_cost`` plus the third element of every entry in the accumulated
    plan. Iteration ends when a step returns the text unchanged.

    Returns a ``(best_text, best_plan)`` tuple; when nothing beats the
    original text's cost, that is the original text and an empty list.

    The resulting plan could be used with :func:`ftfy.fixes.apply_plan`
    to fix additional strings that are broken in the same way.
    """
    def total_cost(candidate_text, steps):
        # Cost of the text itself plus the cost attached to each plan step,
        # added one at a time in plan order.
        total = text_cost(candidate_text)
        for _first, _second, step_cost in steps:
            total += step_cost
        return total

    champion = text
    champion_cost = text_cost(text)
    champion_plan = []
    accumulated = []
    candidate = text
    while True:
        before = candidate
        candidate, step_plan = fix_one_step_and_explain(before)
        accumulated.extend(step_plan)

        candidate_cost = total_cost(candidate, accumulated)
        if candidate_cost < champion_cost:
            champion_cost = candidate_cost
            champion = candidate
            # Snapshot the plan: `accumulated` is mutated on later passes.
            champion_plan = list(accumulated)

        if candidate == before:
            # Fixed point reached; no more steps will change the text.
            return champion, champion_plan
예제 #5
0
파일: fixes.py 프로젝트: mainka/python-ftfy
def fix_text_encoding(text):
    r"""
    A very common defect in real-world text is text that was encoded as
    UTF-8, mistakenly decoded in some other format such as latin-1 or
    Windows codepage 1252, and then encoded as UTF-8 once more.

    The result is that perfectly good Unicode-aware code ends up holding
    garbage text because of a mistake made somewhere upstream.

    This function looks for evidence of that having happened and repairs it.
    It decides whether nonsense sequences of single-byte characters were
    really meant to be UTF-8 characters, and if so, replaces them with the
    correctly-encoded Unicode characters they were meant to represent.

    The input to the function must be Unicode. If you don't have Unicode text,
    you're not using the right tool to solve your problem.

    .. note::
        The following examples are written using unmarked literal strings,
        but they are Unicode text. In Python 2 we have "unicode_literals" turned
        on, and in Python 3 this is always the case.

    ftfy decodes text that looks like it was decoded incorrectly. It leaves
    alone text that doesn't.

        >>> print(fix_text_encoding('único'))
        único

        >>> print(fix_text_encoding('This text is fine already :þ'))
        This text is fine already :þ

    Because these characters often come from Microsoft products, we allow
    for the possibility that we get not just Unicode characters 128-255, but
    also Windows's conflicting idea of what characters 128-160 are.

        >>> print(fix_text_encoding('This — should be an em dash'))
        This — should be an em dash

    We might have to deal with both Windows characters and raw control
    characters at the same time, especially when dealing with characters like
    \x81 that have no mapping in Windows. This is a string that Python's
    standard `.encode` and `.decode` methods cannot correct.

        >>> print(fix_text_encoding('This text is sad .â\x81”.'))
        This text is sad .⁔.

    However, it has safeguards against fixing sequences of letters and
    punctuation that can occur in valid text:

        >>> print(fix_text_encoding('not such a fan of Charlotte Brontë…”'))
        not such a fan of Charlotte Brontë…”

    Cases of genuine ambiguity can sometimes be addressed by finding other
    characters that are not double-encoded, and expecting the encoding to
    be consistent:

        >>> print(fix_text_encoding('AHÅ™, the new sofa from IKEA®'))
        AHÅ™, the new sofa from IKEA®

    Finally, we handle the case where the text is in a single-byte encoding
    that was intended as Windows-1252 all along but read as Latin-1:

        >>> print(fix_text_encoding('This text was never UTF-8 at all\x85'))
        This text was never UTF-8 at all…

    The best version of the text is found using
    :func:`ftfy.badness.text_cost`: candidates are produced one step at a
    time until a step leaves the text unchanged, and the lowest-cost
    candidate (or the input, if nothing scores strictly lower) is returned.
    """
    winner = text
    lowest_cost = text_cost(text)
    current = text
    while True:
        previous = current
        current, step_plan = fix_text_and_explain(previous)
        score = text_cost(current)

        # A particularly obsolete encoding in this step's plan costs a
        # small extra penalty.
        used_obsolete = (
            ('sloppy_encode', 'macroman') in step_plan
            or ('sloppy_encode', 'cp437') in step_plan
        )
        if used_obsolete:
            score += 1

        # Only a strictly lower score displaces the best version so far.
        if score < lowest_cost:
            winner, lowest_cost = current, score

        # A step that changes nothing means there is nothing left to fix.
        if current == previous:
            break
    return winner