def get_wordnet_connected_words(word):
    """
    Collect the words connected to `word` in WordNet: the tokens of each
    entry's definition, its related words, and the tokens of its usage
    examples.
    """
    words = []
    for entry in get_wordnet_entries(word):
        words.extend(tokenize(entry['definition'], 'en'))
        words.extend(entry['related'])
        for example in entry['examples']:
            words.extend(tokenize(example, 'en'))
    return words
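# Usage sketch, not part of the original module: it assumes get_wordnet_entries
# yields dicts with 'definition', 'related', and 'examples' keys, and that
# wordfreq's tokenize is in scope. The word 'teach' is just an illustration.
#
#     from collections import Counter
#     connected = get_wordnet_connected_words('teach')
#     print(Counter(connected).most_common(10))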
def tokenize_by_language(in_file, out_dir, zipped=False, languages=FT_LANGUAGES):
    """
    Take in language-tagged text, and use wordfreq to tokenize it.
    """
    if zipped:
        out_files = {
            language: gzip.open(
                '%s/%s.txt.gz' % (out_dir, language), 'wt', encoding='utf-8'
            )
            for language in languages
        }
    else:
        out_files = {
            language: open(
                '%s/%s.txt' % (out_dir, language), 'w', encoding='utf-8'
            )
            for language in languages
        }
    try:
        for line in in_file:
            lang, text = line.rstrip().split('\t', 1)
            if lang in languages:
                tokenized = tokenize(
                    text, lang, include_punctuation=True, external_wordlist=True
                )
                out_file = out_files[lang]
                print(' '.join(tokenized), file=out_file)
    finally:
        for out_file in out_files.values():
            out_file.close()
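# Usage sketch with hypothetical paths: read tab-separated "lang<TAB>text"
# lines and write one tokenized file per language into the output directory.
# It assumes wordfreq's tokenize, the stdlib gzip module, and the project's
# FT_LANGUAGES constant are in scope.
#
#     with open('tagged_text.tsv', encoding='utf-8') as in_file:
#         tokenize_by_language(in_file, 'tokenized', zipped=True)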
def tokenize_file(
    infile, outfile, language, check_language=False, punctuation=False, ftfy=False
):
    """
    Take in a file of plain text, tokenize it as the given language, and
    write the result as lines of space-separated tokens.
    """
    for line in infile:
        if ftfy:
            # Run all ftfy fixes, but don't let it introduce line breaks
            line = fix_text(line.rstrip()).replace('\n', ' ')
        else:
            # Run only specific quick fixes from ftfy
            line = fix_surrogates(unescape_html(line.rstrip()))
        tokens = tokenize(
            line, language, include_punctuation=punctuation, external_wordlist=True
        )
        if check_language:
            checked_lang, _confidence = detect_language_checked(line.rstrip())
            # Keep the line only when its detected language is close enough
            # to the requested one
            if langcodes.tag_distance(checked_lang, language) < 10:
                print(' '.join(tokens), file=outfile)
        else:
            print(' '.join(tokens), file=outfile)
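# Usage sketch with hypothetical filenames: tokenize English plain text,
# cleaning each line with ftfy and keeping only lines whose detected
# language is close enough to English.
#
#     with open('input.txt', encoding='utf-8') as infile, \
#          open('tokens.txt', 'w', encoding='utf-8') as outfile:
#         tokenize_file(infile, outfile, 'en', check_language=True, ftfy=True)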
def _word_frequency(word, lang, wordlist, minimum):
    tokens = tokenize(word, lang)
    if not tokens:
        return minimum

    # Frequencies for multiple tokens are combined using the formula
    #     1 / f = 1 / f1 + 1 / f2 + ...
    # Thus the resulting frequency is less than any individual frequency, and
    # the smallest frequency dominates the sum.
    freqs = get_frequency_dict(lang, wordlist)
    one_over_result = 0.0
    for token in tokens:
        if token not in freqs:
            # If any word is missing, just return the default value
            return minimum
        one_over_result += 1.0 / freqs[token]

    return max(1.0 / one_over_result, minimum)
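# A standalone check of the combining formula above: with f1 = 1e-4 and
# f2 = 1e-6, the combined frequency is about 9.9e-7, below both inputs and
# dominated by the smaller one.
#
#     f1, f2 = 1e-4, 1e-6
#     combined = 1.0 / (1.0 / f1 + 1.0 / f2)
#     assert combined < min(f1, f2)
#     print(combined)  # ~9.901e-07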
def _word_frequency(word, lang, wordlist, minimum):
    tokens = tokenize(word, lang, combine_numbers=True)
    if not tokens:
        return minimum

    # Frequencies for multiple tokens are combined using the formula
    #     1 / f = 1 / f1 + 1 / f2 + ...
    # Thus the resulting frequency is less than any individual frequency, and
    # the smallest frequency dominates the sum.
    freqs = get_frequency_dict(lang, wordlist)
    one_over_result = 0.0
    for token in tokens:
        if token not in freqs:
            # If any word is missing, just return the default value
            return minimum
        one_over_result += 1.0 / freqs[token]

    freq = 1.0 / one_over_result

    if lang in INFERRED_SPACE_LANGUAGES:
        # In languages written without spaces, the tokenizer had to infer
        # the word boundaries, so apply a penalty for each boundary it
        # inferred
        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)

    return max(freq, minimum)
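# The inferred-space penalty, illustrated numerically (assumed values, for
# illustration only: a factor of 10 per inferred word break, with
# INFERRED_SPACE_LANGUAGES covering spaceless scripts such as Chinese).
# A two-token phrase whose combined frequency is 1e-5 is penalized once,
# down to 1e-6.
#
#     INFERRED_SPACE_FACTOR = 10.0
#     num_tokens = 2
#     combined = 1e-5
#     penalized = combined / INFERRED_SPACE_FACTOR ** (num_tokens - 1)  # 1e-06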