def subtlex_other_deps(dirname_in, languages):
    lines = []
    for language in languages:
        input_file = '{prefix}/subtlex.{lang}.txt'.format(
            prefix=dirname_in, lang=language
        )
        processed_file = wordlist_filename('subtlex-other', language, 'processed.txt')
        output_file = wordlist_filename('subtlex-other', language, 'counts.txt')
        textcol, freqcol = SUBTLEX_COLUMN_MAP[language]

        if language == 'zh':
            step2_file = wordlist_filename('subtlex-other', 'zh-Hans', 'converted.txt')
            add_dep(lines, 'simplify_chinese', input_file, step2_file)
        else:
            step2_file = input_file

        # Skip one header line by setting 'startrow' to 2 (because tail is 1-based).
        # I hope we don't need to configure this by language anymore.
        add_dep(
            lines, 'convert_subtlex', step2_file, processed_file,
            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
        )
        add_dep(
            lines, 'merge_counts', processed_file, output_file,
            params={'cutoff': 0}
        )
    return lines
def subtlex_en_deps(dirname_in, languages):
    lines = []
    # Either subtlex_en is turned off, or it's just in English
    if not languages:
        return lines
    assert languages == ['en']

    regions = ['en-US', 'en-GB']
    processed_files = []
    for region in regions:
        input_file = '{prefix}/subtlex.{region}.txt'.format(
            prefix=dirname_in, region=region
        )
        textcol, freqcol = SUBTLEX_COLUMN_MAP['en']
        processed_file = wordlist_filename('subtlex-en', region, 'processed.txt')
        processed_files.append(processed_file)

        add_dep(
            lines, 'convert_subtlex', input_file, processed_file,
            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
        )

    output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
    add_dep(
        lines, 'merge_counts', processed_files, output_file,
        params={'cutoff': 0}
    )
    return lines
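# Every function in this module builds its piece of the dependency graph by
# calling 'add_dep', which appends a Ninja build statement (plus optional
# per-build variables) to 'lines'. Its real definition lives elsewhere in
# wordfreq_builder; the sketch below only illustrates the call signature used
# here and is an assumption, not the actual implementation.
def _add_dep_sketch(lines, rule, input, output, extra=None, params=None):
    inputs = ' '.join(input) if isinstance(input, list) else input
    outputs = ' '.join(output) if isinstance(output, list) else output
    if extra:
        # Extra source files become implicit dependencies, after '|' in Ninja syntax.
        extra_list = extra if isinstance(extra, list) else [extra]
        extrastr = ' | ' + ' '.join(extra_list)
    else:
        extrastr = ''
    lines.append('build {}: {} {}{}'.format(outputs, rule, inputs, extrastr))
    if params:
        # Indented 'key = value' lines set variables for this one build statement.
        for key, val in params.items():
            lines.append('  {} = {}'.format(key, val))
    lines.append('')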
def jieba_deps(dirname_in, languages):
    lines = []
    # Because there's Chinese-specific handling here, the valid options for
    # 'languages' are [] and ['zh']. Make sure it's one of those.
    if not languages:
        return lines
    assert languages == ['zh']

    input_file = '{prefix}/dict.txt.big'.format(prefix=dirname_in)
    transformed_file = wordlist_filename('jieba', 'zh-Hans', 'converted.txt')
    reformatted_file = wordlist_filename('jieba', 'zh', 'counts.txt')
    add_dep(lines, 'simplify_chinese', input_file, transformed_file)
    add_dep(lines, 'convert_jieba', transformed_file, reformatted_file)
    return lines
def leeds_deps(dirname_in, languages):
    lines = []
    for language in languages:
        input_file = '{prefix}/internet-{lang}-forms.num'.format(
            prefix=dirname_in, lang=language
        )
        if language == 'zh':
            step2_file = wordlist_filename('leeds', 'zh-Hans', 'converted.txt')
            add_dep(lines, 'simplify_chinese', input_file, step2_file)
        else:
            step2_file = input_file

        reformatted_file = wordlist_filename('leeds', language, 'counts.txt')
        add_dep(lines, 'convert_leeds', step2_file, reformatted_file)
    return lines
def opensubtitles_deps(dirname_in, languages):
    lines = []
    for language in languages:
        input_file = '{prefix}/{lang}.txt'.format(
            prefix=dirname_in, lang=language
        )
        if language == 'zh':
            step2_file = wordlist_filename('opensubtitles', 'zh-Hans', 'converted.txt')
            add_dep(lines, 'simplify_chinese', input_file, step2_file)
        else:
            step2_file = input_file

        reformatted_file = wordlist_filename('opensubtitles', language, 'counts.txt')
        add_dep(lines, 'convert_opensubtitles', step2_file, reformatted_file)
    return lines
def twitter_deps(input_filename, slice_prefix, combined_prefix, slices, languages):
    lines = []
    slice_files = [
        '{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num)
        for num in range(slices)
    ]

    # split the input into slices
    add_dep(
        lines, 'split', input_filename, slice_files,
        params={'prefix': '{}.part'.format(slice_prefix), 'slices': slices}
    )

    for slicenum in range(slices):
        slice_file = slice_files[slicenum]
        language_outputs = [
            '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language)
            for language in languages
        ]
        add_dep(
            lines, 'tokenize_twitter', slice_file, language_outputs,
            params={'prefix': slice_file},
            extra='wordfreq_builder/tokenizers.py'
        )

    for language in languages:
        combined_output = wordlist_filename('twitter', language, 'tokens.txt')
        language_inputs = [
            '{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language)
            for slicenum in range(slices)
        ]
        add_dep(lines, 'cat', language_inputs, combined_output)

        count_file = wordlist_filename('twitter', language, 'counts.txt')
        if language == 'ja':
            mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt')
            add_dep(lines, 'tokenize_japanese', combined_output, mecab_token_file)
            combined_output = mecab_token_file

        add_dep(
            lines, 'count', combined_output, count_file,
            extra='wordfreq_builder/tokenizers.py'
        )
    return lines
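# Illustrative only: how the slice-file names used by twitter_deps are formed.
# 'tweets-2014' is a hypothetical prefix; the format spec zero-pads the part
# number to two digits, matching the files the 'split' rule is expected to produce.
_example_slices = [
    '{prefix}.part{num:0>2d}'.format(prefix='tweets-2014', num=num)
    for num in range(3)
]
assert _example_slices == ['tweets-2014.part00', 'tweets-2014.part01', 'tweets-2014.part02']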
def reddit_deps(dirname_in, languages):
    lines = []
    path_in = pathlib.Path(dirname_in)
    slices = {}
    counts_by_language = defaultdict(list)

    # Extract text from the Reddit comment dumps, and write them to
    # .txt.gz files
    for filepath in path_in.glob('*/*.bz2'):
        base = filepath.stem
        transformed_file = wordlist_filename('reddit', base + '.all', 'txt')
        slices[base] = transformed_file
        add_dep(lines, 'extract_reddit', str(filepath), transformed_file)

    for base in sorted(slices):
        transformed_file = slices[base]
        language_outputs = []
        for language in languages:
            filename = wordlist_filename('reddit', base + '.' + language, 'txt')
            language_outputs.append(filename)

            count_filename = wordlist_filename('reddit', base + '.' + language, 'counts.txt')
            add_dep(lines, 'count', filename, count_filename)
            counts_by_language[language].append(count_filename)

        # find the prefix by constructing a filename, then stripping off
        # '.xx.txt' from the end
        prefix = wordlist_filename('reddit', base + '.xx', 'txt')[:-7]
        add_dep(
            lines, 'tokenize_reddit', transformed_file, language_outputs,
            params={'prefix': prefix},
            extra='wordfreq_builder/tokenizers.py'
        )

    for language in languages:
        output_file = wordlist_filename('reddit', language, 'counts.txt')
        add_dep(
            lines, 'merge_counts', counts_by_language[language], output_file,
            params={'cutoff': 3}
        )
    return lines
def wikipedia_deps(dirname_in, languages):
    lines = []
    path_in = pathlib.Path(dirname_in)
    for language in languages:
        # Find the most recent file for this language
        input_file = max(path_in.glob('{}wiki*.bz2'.format(language)))
        plain_text_file = wordlist_filename('wikipedia', language, 'txt')
        count_file = wordlist_filename('wikipedia', language, 'counts.txt')

        add_dep(lines, 'wiki2text', input_file, plain_text_file)
        if language == 'ja':
            mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt')
            add_dep(lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
            add_dep(lines, 'count', mecab_token_file, count_file)
        else:
            add_dep(lines, 'count', plain_text_file, count_file)
    return lines
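# Illustrative only: "most recent file" above relies on max() choosing the
# lexicographically greatest path, which works because Wikipedia dump names
# embed their date as YYYYMMDD. The dump filenames below are hypothetical.
_example_dumps = [
    'enwiki-20151002-pages-articles.xml.bz2',
    'enwiki-20160305-pages-articles.xml.bz2',
]
assert max(_example_dumps) == 'enwiki-20160305-pages-articles.xml.bz2'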
def google_books_deps(dirname_in):
    # Get English data from the split-up files of the Google Syntactic N-grams
    # 2013 corpus.
    lines = []

    # Yes, the files are numbered 00 through 98 of 99. This is not an
    # off-by-one error. Not on my part, anyway.
    input_files = [
        '{}/nodes.{:>02d}-of-99.gz'.format(dirname_in, i)
        for i in range(99)
    ]
    output_file = wordlist_filename('google-books', 'en', 'counts.txt')
    add_dep(lines, 'convert_google_syntactic_ngrams', input_files, output_file)
    return lines
def combine_lists(languages):
    lines = []
    for language in languages:
        sources = source_names(language)
        input_files = [
            wordlist_filename(source, language, 'counts.txt')
            for source in sources
        ]
        output_file = wordlist_filename('combined', language)
        add_dep(
            lines, 'merge', input_files, output_file,
            extra='wordfreq_builder/word_counts.py',
            params={'cutoff': 2, 'lang': language}
        )

        output_cBpack = wordlist_filename('combined-dist', language, 'msgpack.gz')
        output_cBpack_big = wordlist_filename('combined-dist-large', language, 'msgpack.gz')
        add_dep(
            lines, 'freqs2cB', output_file, output_cBpack,
            extra='wordfreq_builder/word_counts.py',
            params={'lang': language, 'buckets': 600}
        )
        add_dep(
            lines, 'freqs2cB', output_file, output_cBpack_big,
            extra='wordfreq_builder/word_counts.py',
            params={'lang': language, 'buckets': 800}
        )

        lines.append('default {}'.format(output_cBpack))
        if language in CONFIG['big-lists']:
            lines.append('default {}'.format(output_cBpack_big))

        # Write standalone lists for Twitter frequency
        if language in CONFIG['sources']['twitter']:
            input_file = wordlist_filename('twitter', language, 'counts.txt')
            output_cBpack = wordlist_filename('twitter-dist', language, 'msgpack.gz')
            add_dep(
                lines, 'freqs2cB', input_file, output_cBpack,
                extra='wordfreq_builder/word_counts.py',
                params={'lang': language, 'buckets': 600}
            )
            lines.append('default {}'.format(output_cBpack))

    # Write a Jieba-compatible frequency file for Chinese tokenization
    chinese_combined = wordlist_filename('combined', 'zh')
    jieba_output = wordlist_filename('jieba-dist', 'zh')
    add_dep(
        lines, 'counts_to_jieba', chinese_combined, jieba_output,
        extra=[
            'wordfreq_builder/word_counts.py',
            'wordfreq_builder/cli/counts_to_jieba.py'
        ]
    )
    lines.append('default {}'.format(jieba_output))
    return lines
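# Illustrative only: a minimal sketch of how the per-source dependency lists
# above could be assembled into one Ninja build file. The entry point name,
# directory paths, and language choices here are assumptions for the example,
# not the actual wordfreq_builder command-line interface.
def _make_ninja_sketch(out_path='build.ninja'):
    lines = []
    lines.extend(wikipedia_deps('data/raw-input/wikipedia', ['en', 'ja']))
    lines.extend(opensubtitles_deps('data/source-lists/opensubtitles', ['en', 'zh']))
    lines.extend(google_books_deps('data/raw-input/google-books'))
    lines.extend(combine_lists(['en', 'ja', 'zh']))
    with open(out_path, 'w', encoding='utf-8') as out:
        out.write('\n'.join(lines) + '\n')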