def wc_mapper(document):
    """Map step for word counting: yield a (word, 1) pair per word in *document*.

    The 1 marks the word's presence so a reducer can sum the pairs.
    NOTE(review): `tokenize` appears to return the *distinct* words of a
    document (per the original author's remark), so a word repeated within
    one document still contributes only a single pair — confirm against
    tokenize's definition.
    """
    for token in tokenize(document):
        yield (token, 1)
def words_per_user_mapper(status_update):
    """Map step: yield (username, (word, 1)) for each word in an update's text.

    Keying on the user lets a reducer find each user's most common words.
    NOTE(review): if `tokenize` de-duplicates words per update, this measures
    how many *updates* used a word rather than raw word frequency — a
    deliberate design choice per the original author's comment.
    """
    author = status_update['username']
    for token in tokenize(status_update['text']):
        yield (author, (token, 1))
def word_count(documents):
    """Count words across *documents* the plain way (no MapReduce).

    Serves as the non-distributed baseline for the mapper/reducer versions:
    tokenize each document and accumulate every resulting word into a
    single Counter.
    """
    tally = Counter()
    for doc in documents:
        tally.update(tokenize(doc))
    return tally