Example #1
def cld2_surface_tokenizer(text, mode='twitter'):
    """
    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.

    The `mode` can be 'twitter' or 'reddit', which slightly changes the
    pre-processing of the text.
    """
    text = unescape_html(text)
    if mode == 'twitter':
        text = TWITTER_HANDLE_RE.sub('', text)
        text = TCO_RE.sub('', text)
    elif mode == 'reddit':
        text = URL_RE.sub('', text)
        text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)

    lang = cld2_detect_language(text)

    # If the detected language isn't in our pretty generous list of languages,
    # return no tokens.
    if lang not in KEEP_THESE_LANGUAGES:
        return 'xx', []

    # cld2's accuracy seems to improve dramatically with at least 50
    # bytes of input, so throw away non-English below this length.
    if len(text.encode('utf-8')) < 50 and lang != 'en':
        return 'xx', []

    tokens = tokenize(text, lang)
    return lang, tokens
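The `cld2_detect_language` helper used above is defined elsewhere in the project. A plausible, self-contained sketch of the kind of call it wraps, using the `pycld2` binding (this is an assumption about the wrapper's internals, not the project's code):

import pycld2

text = "Das ist ein kurzer deutscher Satz."
is_reliable, bytes_found, details = pycld2.detect(text)
# details is a tuple of (languageName, languageCode, percent, score) tuples;
# the code 'un' means the language could not be determined.
lang_code = details[0][1]
print(is_reliable, bytes_found, lang_code)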
Example #2
    def check_ftfy(self, text, encoding_only=True):
        """
        Given a single text input, check whether ftfy's `fix_encoding` (or
        `fix_text`, when `encoding_only` is False) would change it. If so,
        display the change.
        """
        self.count += 1
        text = unescape_html(text)
        if not possible_encoding(text, 'ascii'):
            if encoding_only:
                fixed = fix_encoding(text)
            else:
                fixed = fix_text(text, uncurl_quotes=False, fix_character_width=False)
            if text != fixed:
                # possibly filter common bots before printing
                print('\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                    text=text, fixed=fixed
                ))
                self.num_fixed += 1
            elif 'â€' in text or '\x80' in text:
                print('\nNot fixed:\t{text!r}'.format(text=text))

        # Print status updates once in a while
        if self.count % 100 == 0:
            print('.', end='', flush=True)
        if self.count % 10000 == 0:
            print('\n%d/%d fixed' % (self.num_fixed, self.count))
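A minimal sketch of the same check outside the class, using only `fix_encoding` from ftfy (the sample strings below are made up for illustration):

from ftfy.fixes import fix_encoding

samples = ['This line is fine.', 'The â€œsmartâ€\x9d quotes got mangled.']
for text in samples:
    fixed = fix_encoding(text)
    if fixed != text:
        # Only report lines that fix_encoding would actually change
        print('Text:\t{!r}\nFixed:\t{!r}'.format(text, fixed))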
Example #3
def preprocess_reddit(infile, outfile):
    """
    Read Reddit text from a JSON-lines file, parse the Markdown, and tag
    what language each post is in.

    Filter the posts to enforce _some_ standard of quality:

    - Posts in English should have score >= 2 (they should have net upvotes)
    - Other posts should have score >= 1 (no net downvotes)
    - Posts from subreddits that had been banned as of 2018 are skipped
    """
    for line in infile:
        data = json.loads(line)
        if ('score' in data and 'body' in data and data["score"] is not None
                and data["score"] >= 1 and data["body"] != "[deleted]"):
            subreddit = data["subreddit"]
            subreddit_hash = mmh3.hash(subreddit)
            if subreddit_hash not in BANNED_SUBREDDITS:
                md = fix_surrogates(
                    unescape_html(fix_line_breaks(data["body"])))
                text = strip_markdown(md)
                text = text.replace("\n", " ").replace("\u200b", "")
                text = URL_RE.sub("", text)
                if text:
                    lang, confident = detect_language(text)
                    if confident:
                        # There are more English posts than we need, so filter them
                        # for score >= 2
                        if lang != "en" or data["score"] > 1:
                            print(f"{lang}\t{text}", file=outfile)
Example #4
def tokenize_file(infile,
                  outfile,
                  language,
                  check_language=False,
                  punctuation=False,
                  ftfy=False):
    """
    Take in a file of plain text, tokenize it as the given language, and write
    the result as lines of space-separated tokens.
    """
    for line in infile:
        if ftfy:
            # Run all ftfy fixes, but don't let it introduce line breaks
            line = fix_text(line.rstrip()).replace('\n', ' ')
        else:
            # Run only specific quick fixes from ftfy
            line = fix_surrogates(unescape_html(line.rstrip()))
        tokens = tokenize(line,
                          language,
                          include_punctuation=punctuation,
                          external_wordlist=True)
        checked_lang = None
        if check_language:
            checked_lang, _confidence = detect_language_checked(line.rstrip())
        if not check_language:
            print(' '.join(tokens), file=outfile)
        elif langcodes.tag_distance(checked_lang, language) < 10:
            print(' '.join(tokens), file=outfile)
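The `tag_distance` threshold of 10 treats close variants of the target language as matches while rejecting genuinely different languages. A quick illustration with the `langcodes` package (the example tags are arbitrary):

import langcodes

print(langcodes.tag_distance('en', 'en'))     # 0: identical tags
print(langcodes.tag_distance('en-GB', 'en'))  # small: regional variant, passes the < 10 check
print(langcodes.tag_distance('ja', 'en'))     # large: unrelated languages, filtered out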
Example #5
def preprocess_twitter(infile, outfile):
    """
    Read Twitter text from the format we collected it in, and produce language-tagged
    lines.

    In this format, each line might come with some metadata, such as the tweet ID,
    which appears before the text, separated from the text by a tab character. Or it
    might not contain any such data. We weren't very consistent about it over the years.

    This function reads just the text (the part after the tab, if there is a tab). It
    removes URLs and Twitter handles from the text. It then language-detects the
    text, and if it is confident about the language, it outputs a new tab-separated
    file containing the language code and the processed text.

    This format could be read again by the same function, because the language code
    is now the metadata, but we have no reason to actually do this.
    """
    for line in infile:
        if "\t" in line:
            line = line.split("\t", 1)[1]
        text = line.rstrip()
        text = TWITTER_HANDLE_RE.sub("", text)
        text = TCO_RE.sub("", text)
        text = fix_surrogates(unescape_html(text)).replace("\n", " ")
        lang, _confidence = detect_language_checked(text)
        if lang != 'und':
            print(f"{lang}\t{text}", file=outfile)
Example #6
    def check_ftfy(self, text, encoding_only=True):
        """
        Given a single text input, check whether ftfy's `fix_encoding` (or
        `fix_text`, when `encoding_only` is False) would change it. If so,
        display the change.
        """
        self.count += 1
        text = unescape_html(text)
        if not possible_encoding(text, 'ascii'):
            if encoding_only:
                fixed = fix_encoding(text)
            else:
                fixed = fix_text(text,
                                 uncurl_quotes=False,
                                 fix_character_width=False)
            if text != fixed:
                # possibly filter common bots before printing
                print(u'\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                    text=text, fixed=fixed))
                self.num_fixed += 1

        # Print status updates once in a while
        if self.count % 100 == 0:
            print('.', end='', flush=True)
        if self.count % 10000 == 0:
            print('\n%d/%d fixed' % (self.num_fixed, self.count))
Example #7
def test_entities():
    example = '&amp;\n<html>\n&amp;'
    assert fix_text(example) == '&\n<html>\n&amp;'
    assert fix_text_segment(example) == '&amp;\n<html>\n&amp;'

    assert fix_text(example, fix_entities=True) == '&\n<html>\n&'
    assert fix_text_segment(example, fix_entities=True) == '&\n<html>\n&'

    assert fix_text(example, fix_entities=False) == '&amp;\n<html>\n&amp;'
    assert fix_text_segment(example,
                            fix_entities=False) == '&amp;\n<html>\n&amp;'

    assert fix_text_segment('&lt;&gt;', fix_entities=False) == '&lt;&gt;'
    assert fix_text_segment('&lt;&gt;', fix_entities=True) == '<>'
    assert fix_text_segment('&lt;&gt;') == '<>'
    assert fix_text_segment('jednocze&sacute;nie') == 'jednocześnie'
    assert fix_text_segment('JEDNOCZE&Sacute;NIE') == 'JEDNOCZEŚNIE'
    assert fix_text_segment('ellipsis&#133;',
                            normalization='NFKC') == 'ellipsis...'
    assert fix_text_segment('ellipsis&#x85;',
                            normalization='NFKC') == 'ellipsis...'
    assert fix_text_segment('broken&#x81;') == 'broken\x81'
    assert fix_text_segment('&amp;amp;amp;') == '&'
    assert unescape_html('euro &#x80;') == 'euro €'
    assert unescape_html('EURO &EURO;') == 'EURO €'
    assert unescape_html('not an entity &#20x6;') == 'not an entity &#20x6;'
    assert unescape_html('JEDNOCZE&SACUTE;NIE') == 'JEDNOCZEŚNIE'
    assert unescape_html('V&SCARON;ICHNI') == 'VŠICHNI'
    assert unescape_html('&#xffff;') == ''
    assert unescape_html('&#xffffffff;') == '\ufffd'
    assert (fix_text_segment('this is just informal english &not html') ==
            'this is just informal english &not html')
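The first few assertions hinge on the `fix_entities='auto'` heuristic: when a chunk contains both `<` and `>`, it is assumed to be real HTML source and entities are left alone. A small sketch of that behavior, assuming the older ftfy API these tests target:

from ftfy import fix_text_segment

print(fix_text_segment('&amp; plain text'))      # no markup, entity decoded: '& plain text'
print(fix_text_segment('&amp; <html> markup'))   # looks like HTML, entities kept as-is
print(fix_text_segment('&amp; <html> markup', fix_entities=True))  # decoding forced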
Example #8
def cld2_surface_tokenizer(text):
    """
    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
    """
    text = unescape_html(text)
    text = TWITTER_HANDLE_RE.sub('', text)
    text = TCO_RE.sub('', text)
    lang = cld2_detect_language(text)
    tokens = tokenize(text, lang)
    return lang, tokens
Example #9
def fix_text_segment(
    text,
    *,
    fix_entities='auto',
    remove_terminal_escapes=True,
    fix_encoding=True,
    fix_latin_ligatures=True,
    fix_character_width=True,
    uncurl_quotes=True,
    fix_line_breaks=True,
    fix_surrogates=True,
    remove_control_chars=True,
    remove_bom=True,
    normalization='NFC'
):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
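The `while True` loop re-applies the whole fixer pipeline until a pass makes no change, so a fix that exposes another problem gets a further chance to run. A quick sketch using the public entry point and the shrug example from ftfy's documentation:

from ftfy import fix_text

# '&macr;' is the HTML entity for '¯' and 'ã\x83\x84' is mojibake for 'ツ';
# both kinds of damage are repaired in the same call.
print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))   # ¯\_(ツ)_/¯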
Example #10
def test_entities():
    example = '&amp;\n<html>\n&amp;'
    assert fix_text(example) == '&\n<html>\n&amp;'
    assert fix_text_segment(example) == '&amp;\n<html>\n&amp;'

    assert fix_text(example, fix_entities=True) == '&\n<html>\n&'
    assert fix_text_segment(example, fix_entities=True) == '&\n<html>\n&'

    assert fix_text(example, fix_entities=False) == '&amp;\n<html>\n&amp;'
    assert fix_text_segment(example, fix_entities=False) == '&amp;\n<html>\n&amp;'

    assert fix_text_segment('&lt;&gt;', fix_entities=False) == '&lt;&gt;'
    assert fix_text_segment('&lt;&gt;', fix_entities=True) == '<>'
    assert fix_text_segment('&lt;&gt;') == '<>'
    assert fix_text_segment('jednocze&sacute;nie') == 'jednocześnie'
    assert fix_text_segment('JEDNOCZE&Sacute;NIE') == 'JEDNOCZEŚNIE'
    assert fix_text_segment('ellipsis&#133;', normalization='NFKC') == 'ellipsis...'
    assert fix_text_segment('ellipsis&#x85;', normalization='NFKC') == 'ellipsis...'
    assert fix_text_segment('broken&#x81;') == 'broken\x81'
    assert unescape_html('euro &#x80;') == 'euro €'
    assert unescape_html('not an entity &#20x6;') == 'not an entity &#20x6;'
Example #11
def test_entities():
    example = '&amp;\n<html>\n&amp;'
    eq_(fix_text(example), '&\n<html>\n&amp;')
    eq_(fix_text_segment(example), '&amp;\n<html>\n&amp;')

    eq_(fix_text(example, fix_entities=True), '&\n<html>\n&')
    eq_(fix_text_segment(example, fix_entities=True), '&\n<html>\n&')

    eq_(fix_text(example, fix_entities=False), '&amp;\n<html>\n&amp;')
    eq_(fix_text_segment(example, fix_entities=False), '&amp;\n<html>\n&amp;')

    eq_(fix_text_segment('&lt;&gt;', fix_entities=False), '&lt;&gt;')
    eq_(fix_text_segment('&lt;&gt;', fix_entities=True), '<>')
    eq_(fix_text_segment('&lt;&gt;'), '<>')
    eq_(fix_text_segment('jednocze&sacute;nie'), 'jednocześnie')
    eq_(fix_text_segment('JEDNOCZE&Sacute;NIE'), 'JEDNOCZEŚNIE')
    eq_(fix_text_segment('ellipsis&#133;', normalization='NFKC'), 'ellipsis...')
    eq_(fix_text_segment('ellipsis&#x85;', normalization='NFKC'), 'ellipsis...')
    eq_(fix_text_segment('broken&#x81;'), 'broken\x81')
    eq_(unescape_html('euro &#x80;'), 'euro €')
    eq_(unescape_html('not an entity &#20x6;'), 'not an entity &#20x6;')
Example #12
def fix_text_segment(text,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     fix_latin_ligatures=True,
                     fix_character_width=True,
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True,
                     normalization='NFC'):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
Example #13
def cld2_surface_tokenizer(text):
    """
    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
    """
    text = unescape_html(text)
    text = TWITTER_HANDLE_RE.sub('', text)
    text = TCO_RE.sub('', text)
    lang = cld2_detect_language(text)

    # Don't allow tokenization in Chinese when language-detecting, because
    # the Chinese tokenizer may not be built yet
    if lang == 'zh':
        lang = 'en'

    tokens = tokenize(text, lang)
    return lang, tokens
Example #14
def fix_text_segment(text,
                     remove_unsafe_private_use=False,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is all in the same encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_unsafe_private_use:
            text = fixes.remove_unsafe_private_use(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if text == origtext:
            return text
Example #15
def preprocess_reddit_lines(input_lines):
    for line in input_lines:
        data = json.loads(line)
        if ('score' in data and 'body' in data and data["score"] is not None
                and data["score"] >= 2 and data["body"] != "[deleted]"
                and data["body"] != "[removed]"):
            subreddit = data["subreddit"].casefold()
            subreddit_hash = mmh3.hash(subreddit)
            if subreddit_hash not in BANNED_SUBREDDITS:
                md = fix_surrogates(
                    unescape_html(fix_line_breaks(data["body"])))
                text = strip_markdown(md)
                text = text.replace("\n", " ").replace("\u200b", "")
                text = URL_RE.sub("", text)
                if text:
                    lang, _confidence = detect_language_checked(text)
                    if lang != 'und':
                        # There are more English posts than we need, so filter them
                        # for score >= 3
                        if lang != "en" or data["score"] > 2:
                            yield (lang, text)
Example #16
from ftfy import fix_text, explain_unicode  # imports needed by the lines below

print(fix_text('ünicode'))

print(fix_text('&lt;3'))

print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))

len(fix_text(''))

explain_unicode('ノ( º _ ºノ) 테스트')

from ftfy.fixes import fix_encoding, unescape_html, uncurl_quotes, fix_line_breaks, decode_escapes

print(fix_encoding('â\x81”.'))

print(unescape_html('&lt;hr&gt;'))

print(uncurl_quotes('\u201ctest\u201d'))

print(fix_line_breaks("1. hello\u2028" "2. world"))

factoid = '\\u20a2'
print(decode_escapes(factoid))

from ftfy.formatting import character_width, display_center

print(character_width('A'))
print(character_width('가'))

lines = ['Display center', 'center']
for line in lines:
    # assumed completion: center each line within a display width of 20 columns
    print(display_center(line, 20))