Example #1
def wc_mapper(document):
    """ generator that yields (word,1) for each word in document. 1
    indicates word presence """
    # note we are looking at the occurrence of distinct words from each
    # document, not the overall occurrence of words. For example, document 1
    # may use the word 'it' ten times, but it will be counted only once for
    # document 1. The text seems to miss this point.
    for word in tokenize(document):
        yield (word, 1)
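The distinct-word behaviour described in the comment depends on tokenize returning a set. That helper is not shown here, so the following is only a minimal sketch of one plausible definition (an assumption, not the original's exact code):

import re

def tokenize(document):
    """ hypothetical helper: lowercase the document and return the SET of
    distinct words, so a word repeated within one document counts once """
    return set(re.findall("[a-z0-9']+", document.lower()))

# with this tokenize, list(wc_mapper("data data data")) == [('data', 1)]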
Example #2
def words_per_user_mapper(status_update):
    """ yields (username, (word, 1)) tuple """
    # note the tokenize function forms a set of distinct words, so here we
    # are getting the most popular words across status updates, i.e. the
    # words that appear in the most updates. This is a choice; we could
    # instead have looked for the most popular word among all the words
    # from every update.
    user = status_update['username']
    for word in tokenize(status_update['text']):
        yield (user, (word, 1))
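To make the "most popular word" step concrete, here is a hedged sketch of a per-user reducer that these (user, (word, 1)) pairs could feed into; the reducer's name and the assumption that all pairs for one user arrive grouped together are mine, not the original's:

from collections import Counter

def most_popular_word_reducer(user, words_and_counts):
    """ given every (word, count) pair emitted for one user, yield
    (user, (word, count)) for the word with the highest total count """
    word_totals = Counter()
    for word, count in words_and_counts:
        word_totals[word] += count
    word, count = word_totals.most_common(1)[0]
    yield (user, (word, count))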
Example #3
from collections import Counter

def word_count(documents):
    """ basic word counting by looping through each document and counting
    words. """
    return Counter(word
                   for document in documents
                   for word in tokenize(document))
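A quick sanity check with made-up documents; the expected count assumes tokenize de-duplicates words within each document, as sketched above:

documents = ["data science is fun",
             "science is hard, data is everywhere"]
counts = word_count(documents)
print(counts['is'])   # 2: 'is' appears twice in the second document,
                      # but each document contributes it only once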