def emit(word_it):
    """Lazily yield one count record per word taken from ``word_it``.

    For each word a record is built with ``vd(int, {...})`` holding:

    * the word itself mapped to ``1``;
    * ``'begin_with'``: a nested ``vd(int, ...)`` mapping the word's first
      character to ``1``;
    * ``'has_size'``: a nested ``vd(int, ...)`` mapping the word's length
      to ``1``.

    Records are yielded in input order.  No filtering is applied here, so an
    empty word raises ``IndexError`` when its first character is read.
    """
    for token in word_it:
        first_char_tally = vd(int, {token[0]: 1})
        length_tally = vd(int, {len(token): 1})
        yield vd(int, {
            token: 1,
            'begin_with': first_char_tally,
            'has_size': length_tally,
        })
def word_count(unicode_file):
    """Tally the words of a text file, plus first-letter and length counts.

    Each line of the file is split on runs of ``. ! " - , '`` and whitespace.
    Every piece is lower-cased and stripped of any remaining
    ``string.punctuation`` characters.  Pieces that are still longer than
    two characters after cleaning are accumulated, via ``+=``, into a single
    ``vd(int, ...)`` record holding:

    * the word itself mapped to ``1``;
    * ``'begin_with'``: a nested ``vd(int, ...)`` keyed by the first character;
    * ``'has_size'``: a nested ``vd(int, ...)`` keyed by the word length.

    Args:
        unicode_file: Path of the text file to read; passed straight to
            ``open`` with its default mode and encoding.

    Returns:
        The accumulated ``vd`` record (the initial ``vd(int, {})`` if no
        word qualifies).
    """
    exclude = set(string.punctuation)
    # Raw string so the regex escapes are not parsed as (invalid) string
    # escapes; it matches the same separator set as before.
    sp_pattern = re.compile(r"""[.!"\s\-,']+""", re.M)

    def _clean(word):
        # Drop punctuation the separator pattern does not split on.
        return ''.join(ch for ch in word if ch not in exclude)

    res = vd(int, {})
    # The context manager closes the file even if accumulation raises.
    with open(unicode_file) as lines:
        for line in lines:
            for piece in sp_pattern.split(line):
                word = _clean(piece.lower())
                # Length is checked after cleaning, so stripped punctuation
                # does not count towards the minimum size.
                if len(word) > 2:
                    res += vd(int, {
                        word: 1,
                        'begin_with': vd(int, {word[0]: 1}),
                        'has_size': vd(int, {len(word): 1}),
                    })
    return res