def _pre_process_record(record):
    """Normalize the tweet text of a single ``(sentiment, tweet)`` record.

    Args:
        record: A two-element sequence ``(sentiment, tweet)`` where ``tweet``
            is the raw tweet string. Anything that does not unpack into
            exactly two items raises, as the original tuple-parameter
            lambdas did.

    Returns:
        A ``(sentiment, tokens)`` tuple; ``sentiment`` is passed through
        untouched and ``tokens`` is the cleaned list of words.
    """
    # Explicit unpacking replaces the Python 2-only ``lambda (a, b): ...``
    # form, which is a SyntaxError on Python 3.
    sentiment, tweet = record

    # Tokenize on single spaces. NOTE(review): consecutive spaces produce
    # empty-string tokens here; whether they are dropped later depends on
    # remove_punc_keep_emoj -- confirm.
    tokens = tweet.strip().split(" ")

    # Lowercase each word, e.g. ['this', 'is', 'a', 'lowercased', 'tweet'].
    tokens = lower_case(tweet=tokens)

    # Remove punctuation from the tweet. Example:
    # ["is,", "so", "sad", "for", "my", "APL", "friend", "............."]
    # should become ["is", "so", "sad", "for", "my", "APL", "friend"].
    tokens = remove_punc_keep_emoj(tweet=tokens)

    # Stem words to their root form, e.g. "missing" or "missed" -> "miss".
    tokens = [stem_words(word=word) for word in tokens]

    # Remove stop words such as: a, I, and, all, once, etc.
    tokens = [word for word in tokens if word not in STOPWORDS]

    # Shorten elongated words so that a repeated letter appears at most
    # 3 times in a row, e.g. "cooooollllll" -> "cooolll".
    tokens = [pytypo.cut_repeat(word, 3) for word in tokens]

    return sentiment, tokens


def pre_process(rdd):
    """Clean every tweet in an RDD of ``(sentiment, tweet)`` records.

    Each tweet string is tokenized, lowercased, stripped of punctuation,
    stemmed, filtered against ``STOPWORDS`` and has elongated words
    shortened, in that order. The sentiment label is left unchanged.

    Args:
        rdd: An object exposing ``map`` (presumably a PySpark RDD) whose
            elements are ``(sentiment, tweet)`` pairs with ``tweet`` a string.

    Returns:
        The result of ``rdd.map``: records of the form
        ``(sentiment, [token, ...])``.
    """
    # One map applies all steps per record; the per-record results are the
    # same as chaining one map per step.
    return rdd.map(_pre_process_record)
def test_cut_repeat():
    """Check that ``pytypo.cut_repeat`` shortens runs of repeated letters."""
    # Each case: (input word, maximum allowed run length, expected output).
    cases = (
        ('pytypooooooo', 1, 'pytypo'),
        ('beeeeer', 2, 'beer'),
    )
    for word, max_repeat, expected in cases:
        assert_equal(pytypo.cut_repeat(word, max_repeat), expected)