示例#1
0
文件: collect.py 项目: rspeer/charcol
 def check_ftfy(self, text):
     """
     Run one piece of text through ftfy's encoding fixer and, when the
     fixer changes it, report the before/after pair and bump the counter.

     Texts that are pure ASCII, mention 'unfollow', or are t.co links are
     skipped or not printed, to cut down on noise.
     """
     lowered = remove_unsafe_private_use(text).lower()
     # Pure-ASCII text can't be mojibake; 'unfollow' tweets are boilerplate.
     if possible_encoding(text, 'ascii') or 'unfollow' in lowered:
         return
     repaired = fix_text_encoding(text)
     if repaired == text:
         return
     # Shortened-link tweets are fixed silently, without printing.
     if not lowered.startswith('http://t.co/'):
         print(u'Text:\t{text}\nFixed:\t{fixed}\n'.format(text=text, fixed=repaired))
     self.num_fixed += 1
示例#2
0
def fix_text_segment(text,
                     remove_unsafe_private_use=False,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Run every enabled fix over a single chunk of text, repeating until the
    text stops changing (a fixed point). The chunk may be one line from
    `fix_text`, or a larger span known to share a single encoding.

    See `fix_text` for what each keyword argument controls.
    Raises UnicodeError if given bytes instead of text.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    # Text that contains both '<' and '>' probably really is HTML/XML-ish,
    # so auto mode leaves its entities alone.
    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False

    # The enabled fixes, in the order they must be applied each pass.
    pipeline = [
        (remove_unsafe_private_use, fixes.remove_unsafe_private_use),
        (fix_entities, fixes.unescape_html),
        (remove_terminal_escapes, fixes.remove_terminal_escapes),
        (fix_encoding, fixes.fix_text_encoding),
        (normalization is not None,
         lambda chunk: unicodedata.normalize(normalization, chunk)),
        (uncurl_quotes, fixes.uncurl_quotes),
        (fix_line_breaks, fixes.fix_line_breaks),
        (fix_surrogates, fixes.fix_surrogates),
        (remove_control_chars, fixes.remove_control_chars),
        (remove_bom, fixes.remove_bom),
    ]

    while True:
        previous = text
        for enabled, apply_fix in pipeline:
            if enabled:
                text = apply_fix(text)
        # Stop once a full pass makes no further change.
        if text == previous:
            return text
示例#3
0
def fix_text_segment(text,
                     remove_unsafe_private_use=False,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply the selected fixes to one chunk of text, iterating until no fix
    changes anything further. The chunk can be a single line processed by
    `fix_text`, or a larger body of text known to be in one encoding.

    See `fix_text` for a description of the parameters.
    Raises UnicodeError when given bytes rather than text.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    # In 'auto' mode, the presence of both angle brackets suggests real
    # markup, so HTML entities are left untouched.
    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False

    def _single_pass(segment):
        # One application of each enabled fix, in the required order.
        if remove_unsafe_private_use:
            segment = fixes.remove_unsafe_private_use(segment)
        if fix_entities:
            segment = fixes.unescape_html(segment)
        if remove_terminal_escapes:
            segment = fixes.remove_terminal_escapes(segment)
        if fix_encoding:
            segment = fixes.fix_text_encoding(segment)
        if normalization is not None:
            segment = unicodedata.normalize(normalization, segment)
        if uncurl_quotes:
            segment = fixes.uncurl_quotes(segment)
        if fix_line_breaks:
            segment = fixes.fix_line_breaks(segment)
        if fix_surrogates:
            segment = fixes.fix_surrogates(segment)
        if remove_control_chars:
            segment = fixes.remove_control_chars(segment)
        if remove_bom:
            segment = fixes.remove_bom(segment)
        return segment

    # Iterate to a fixed point: some fixes can expose work for others.
    result = _single_pass(text)
    while result != text:
        text = result
        result = _single_pass(text)
    return result
示例#4
0
def standardize_word(word):
    u"""
    Normalize a word for counting: strip unsafe private-use characters,
    apply NFKC normalization (pre-composed diacritics), and lowercase it.

    Language-specific caveats:

    - Greek words ending in a capital "Σ" lowercase to a final "ς" on
      Python 3 but "σ" on Python 2 (Python 3 is orthographically correct),
      so frequencies for such words differ between versions.

    - Turkish words containing a capital "I" are lowercased to "i" rather
      than the correct "ı", so capitalized and lowercase forms of those
      words end up counted separately.
    """
    cleaned = remove_unsafe_private_use(word)
    return normalize('NFKC', cleaned).lower()
示例#5
0
def render_safe(text):
    '''
    Return `text` stripped of unsafe private-use characters and control
    characters, so it can be safely passed to an external process.
    '''
    without_private_use = remove_unsafe_private_use(text)
    return remove_control_chars(without_private_use)
示例#6
0
def render_safe(text):
    '''
    Scrub the given text so it is safe to hand to an external process:
    unsafe private-use characters go first, then control characters.
    '''
    for scrub in (remove_unsafe_private_use, remove_control_chars):
        text = scrub(text)
    return text