def merge_lists(input_names, output_name):
    """Merge the values read from several input files into one wordlist.

    Each name in `input_names` is passed to `read_values` with `cutoff=0`.
    The first element of each returned pair is collected, the collection is
    combined with `merge_counts`, and the result is handed to
    `write_wordlist` together with `output_name`.
    """
    # read_values returns a pair; only its first element is used here.
    per_file_values = [
        values
        for values, _total in (
            read_values(name, cutoff=0) for name in input_names
        )
    ]
    combined = merge_counts(per_file_values)
    write_wordlist(combined, output_name)
def merge_lists(input_names, output_name, cutoff=0, max_words=1000000):
    """Merge the values read from several input files into one wordlist.

    Each name in `input_names` is passed to `read_values`, forwarding
    `cutoff` and `max_words` unchanged. The first element of each returned
    pair is collected, the collection is combined with `merge_counts`, and
    the result is handed to `write_wordlist` together with `output_name`.
    """
    per_file_values = []
    for path in input_names:
        # read_values returns a pair; the second element is not needed.
        values, _total = read_values(path, cutoff=cutoff, max_words=max_words)
        per_file_values.append(values)

    write_wordlist(merge_counts(per_file_values), output_name)
def merge_lists(input_names, output_name, cutoff, lang):
    """Merge the frequencies read from several input files into one wordlist.

    Each name in `input_names` is passed to `read_freqs` with the given
    `cutoff` and a language argument derived from `lang`. The results are
    combined with `merge_freqs` and handed to `write_wordlist` together
    with `output_name`.
    """
    # Don't use Chinese tokenization while building wordlists, as that would
    # create a circular dependency.
    read_lang = None if lang == 'zh' else lang

    per_file_freqs = [
        read_freqs(name, cutoff=cutoff, lang=read_lang)
        for name in input_names
    ]
    write_wordlist(merge_freqs(per_file_freqs), output_name)
def merge_lists(input_names, output_name):
    """Merge the frequencies read from several input files into one wordlist.

    Each name in `input_names` is passed to `read_freqs` with `cutoff=2`.
    The results are combined with `merge_freqs` and handed to
    `write_wordlist` together with `output_name`.
    """
    per_file_freqs = [read_freqs(name, cutoff=2) for name in input_names]
    combined = merge_freqs(per_file_freqs)
    write_wordlist(combined, output_name)
def handle_counts(filename_in, filename_out):
    """Run `count_tokens` on `filename_in` and write the result as a wordlist.

    The value returned by `count_tokens(filename_in)` is passed straight to
    `write_wordlist` together with `filename_out`.
    """
    token_counts = count_tokens(filename_in)
    write_wordlist(token_counts, filename_out)