def parse(lang_sample):
    """Tally word popularity using novel extracts, etc.

    Returns a ``(vocabulary, counts)`` pair: the set of distinct
    words found in *lang_sample* and a default-zero mapping of each
    word to its occurrence count.
    """
    # Keep duplicates so the tally reflects true frequencies.
    tokens = words_from_archive(lang_sample, include_dups=True)
    counts = zero_default_dict()
    for token in tokens:
        counts[token] = counts[token] + 1
    return set(tokens), counts
def parse(lang_sample, file_format='bz'):
    """Tally word popularity using novel extracts, etc.

    Parameters
    ----------
    lang_sample : path/name of the sample to read.
    file_format : 'bz' to read via ``words_from_archive`` (keeping
        duplicates), 'txt' to read via ``words_from_txt``.

    Returns
    -------
    (set, dict-like) : the distinct words and a default-zero mapping
        of each word to its occurrence count.

    Raises
    ------
    ValueError : if *file_format* is not 'bz' or 'txt'.
    """
    # NOTE: the docstring must be the first statement in the body —
    # in the previous revision it sat after this import and was
    # silently discarded.
    from autocorrect.utils import words_from_archive, words_from_txt, \
        zero_default_dict
    if file_format == 'bz':
        words = words_from_archive(lang_sample, include_dups=True)
    elif file_format == 'txt':
        words = words_from_txt(lang_sample)
    else:
        # Previously an unknown format fell through both branches and
        # crashed below with NameError; fail fast with a clear message.
        raise ValueError("file_format must be 'bz' or 'txt', "
                         "got {!r}".format(file_format))
    counts = zero_default_dict()
    for word in words:
        counts[word] += 1
    return set(words), counts
from autocorrect.utils import words_from_archive # en_US_GB_CA is a superset of US, GB and CA # spellings (color, colour, etc). It contains # roughly half a million words. For this # example, imagine it's just seven words... # # we (lower) # flew (lower) # to (lower) # Abu (mixed) # Dhabi (mixed) # via (lower) # Colombo (mixed) LOWERCASE = words_from_archive('en_US_GB_CA_lower.txt') # {'we', 'flew', 'to', 'via'} CASE_MAPPED = words_from_archive('en_US_GB_CA_mixed.txt', map_case=True) # {abu': 'Abu', # 'dhabi': 'Dhabi', # 'colombo': 'Colombo'} # # Note that en_US_GB_CA_mixed.txt also contains # acronyms/mixed case variants of common words, # so in reality, CASE_MAPPED also contains: # # {'to': 'TO', # 'via': 'Via'}