Example #1
def get_wordnet_connected_words(word):
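    """
    Collect the words connected to `word` in WordNet: the tokens of its
    definitions, its related terms, and the tokens of its usage examples.
    """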
    words = []
    for entry in get_wordnet_entries(word):
        words.extend(tokenize(entry['definition'], 'en'))
        words.extend(entry['related'])
        for example in entry['examples']:
            words.extend(tokenize(example, 'en'))
    return words
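
A minimal usage sketch. It assumes `tokenize` is wordfreq's tokenizer and that `get_wordnet_entries` returns dicts with 'definition', 'related', and 'examples' keys; the stub below is hypothetical, inferred only from how the function reads those fields.

from wordfreq import tokenize

def get_wordnet_entries(word):
    # Hypothetical stub standing in for the real WordNet reader.
    return [{
        'definition': 'a domesticated carnivorous mammal',
        'related': ['canine', 'hound'],
        'examples': ['the dog barked all night'],
    }]

print(get_wordnet_connected_words('dog'))
# -> tokens of the definition, the related lemmas, and tokens of each example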
Example #2
def tokenize_by_language(in_file,
                         out_dir,
                         zipped=False,
                         languages=FT_LANGUAGES):
    """
    Take in language-tagged text, and use wordfreq to tokenize it.
    """
    if zipped:
        out_files = {
            language: gzip.open('%s/%s.txt.gz' % (out_dir, language),
                                'wt',
                                encoding='utf-8')
            for language in languages
        }
    else:
        out_files = {
            language: open('%s/%s.txt' % (out_dir, language),
                           'w',
                           encoding='utf-8')
            for language in languages
        }
    try:
        for line in in_file:
            lang, text = line.rstrip().split('\t', 1)
            if lang in languages:
                tokenized = tokenize(text,
                                     lang,
                                     include_punctuation=True,
                                     external_wordlist=True)
                out_file = out_files[lang]
                print(' '.join(tokenized), file=out_file)
    finally:
        for out_file in out_files.values():
            out_file.close()
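
A usage sketch, assuming the input is tab-separated `lang<TAB>text` lines and that `FT_LANGUAGES` and `tokenize` come from the surrounding project; the file name, output directory, and language list below are placeholders.

import gzip
from wordfreq import tokenize

FT_LANGUAGES = ['en', 'fr']   # placeholder for the project's language list

with open('tagged_corpus.tsv', encoding='utf-8') as in_file:
    # the output directory must already exist
    tokenize_by_language(in_file, 'tokens', zipped=False,
                         languages=FT_LANGUAGES)
# writes tokens/en.txt and tokens/fr.txt, one space-joined line per input line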
Example #3
def tokenize_file(infile,
                  outfile,
                  language,
                  check_language=False,
                  punctuation=False,
                  ftfy=False):
    """
    Take in a file of plain text, tokenize it as the given language, and write
    the result as lines of space-separated tokens.
    """
    for line in infile:
        if ftfy:
            # Run all ftfy fixes, but don't let it introduce line breaks
            line = fix_text(line.rstrip()).replace('\n', ' ')
        else:
            # Run only specific quick fixes from ftfy
            line = fix_surrogates(unescape_html(line.rstrip()))
        tokens = tokenize(line,
                          language,
                          include_punctuation=punctuation,
                          external_wordlist=True)
        checked_lang = None
        if check_language:
            checked_lang, _confidence = detect_language_checked(line.rstrip())
        if not check_language:
            print(' '.join(tokens), file=outfile)
        elif langcodes.tag_distance(checked_lang, language) < 10:
            print(' '.join(tokens), file=outfile)
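
A short sketch of how this might be called. It assumes `fix_text`, `fix_surrogates`, and `unescape_html` come from ftfy and that `detect_language_checked` and `langcodes` are supplied by the surrounding project; the file names are placeholders.

from ftfy import fix_text
from ftfy.fixes import fix_surrogates, unescape_html
from wordfreq import tokenize

with open('corpus.txt', encoding='utf-8') as infile, \
        open('corpus.tokens.txt', 'w', encoding='utf-8') as outfile:
    tokenize_file(infile, outfile, 'en', check_language=False, ftfy=True)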
Example #4
def _word_frequency(word, lang, wordlist, minimum):
    tokens = tokenize(word, lang)
    if not tokens:
        return minimum

    # Frequencies for multiple tokens are combined using the formula
    #     1 / f = 1 / f1 + 1 / f2 + ...
    # Thus the resulting frequency is less than any individual frequency, and
    # the smallest frequency dominates the sum.
    freqs = get_frequency_dict(lang, wordlist)
    one_over_result = 0.0
    for token in tokens:
        if token not in freqs:
            # If any word is missing, just return the default value
            return minimum
        one_over_result += 1.0 / freqs[token]

    return max(1.0 / one_over_result, minimum)
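
A quick worked example of that combination rule, with made-up frequencies for illustration:

# Suppose tokenize('New York', 'en') yields ['new', 'york'] and the wordlist
# gives these (made-up) frequencies:
f_new, f_york = 1e-3, 1e-4

one_over_result = 1.0 / f_new + 1.0 / f_york   # 1000 + 10000 = 11000
combined = 1.0 / one_over_result               # ~9.09e-5
# The combined frequency is below both individual frequencies and is
# dominated by the rarer token, 'york'.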
Example #5
def _word_frequency(word, lang, wordlist, minimum):
    tokens = tokenize(word, lang, combine_numbers=True)
    if not tokens:
        return minimum

    # Frequencies for multiple tokens are combined using the formula
    #     1 / f = 1 / f1 + 1 / f2 + ...
    # Thus the resulting frequency is less than any individual frequency, and
    # the smallest frequency dominates the sum.
    freqs = get_frequency_dict(lang, wordlist)
    one_over_result = 0.0
    for token in tokens:
        if token not in freqs:
            # If any word is missing, just return the default value
            return minimum
        one_over_result += 1.0 / freqs[token]

    freq = 1.0 / one_over_result

    if lang in INFERRED_SPACE_LANGUAGES:
        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)

    return max(freq, minimum)
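
The differences from the previous example are `combine_numbers=True` and the extra penalty applied when a language's tokens are inferred rather than separated by explicit spaces. A sketch of that penalty with an assumed factor value (the real constants live in the surrounding module):

INFERRED_SPACE_FACTOR = 10.0           # assumed value, for illustration only

tokens = ['燃', '烧']                   # two tokens inferred from unspaced text
combined_freq = 2e-5                    # made-up combined frequency from above
penalized = combined_freq / INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
# -> 2e-6: each extra inferred token divides the estimate by the factor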