def pre_process(rdd):
    """Clean the tweet text of every ``(sentiment, tweet)`` record in ``rdd``.

    Each tweet string is turned into a list of tokens by applying, in order:

    1. strip the tweet and split it on single spaces;
    2. ``lower_case`` on the token list;
    3. ``remove_punc_keep_emoj`` on the token list;
    4. ``stem_words`` on every token;
    5. removal of tokens found in ``STOPWORDS``;
    6. ``pytypo.cut_repeat(token, 3)`` on every remaining token.

    The sentiment label is passed through untouched.

    Args:
        rdd: Object exposing ``map`` (presumably a Spark RDD -- confirm
            against the caller) whose elements are 2-item
            ``(sentiment, tweet)`` pairs, with ``tweet`` a string.

    Returns:
        Whatever ``rdd.map`` returns; its elements are
        ``(sentiment, tokens)`` pairs, ``tokens`` being a list.
    """

    def clean(tweet):
        # Split on a single space rather than on arbitrary whitespace, so
        # consecutive spaces yield empty-string tokens at this stage.
        tokens = tweet.strip().split(" ")

        # Helpers are defined elsewhere in the project and are called with
        # the same keyword arguments as before.
        tokens = lower_case(tweet=tokens)
        # NOTE(review): the helper name suggests punctuation is removed while
        # emoji/emoticons are kept -- confirm against its implementation.
        tokens = remove_punc_keep_emoj(tweet=tokens)

        # Stemming runs before the stop-word filter, so tokens are compared
        # against STOPWORDS in their stemmed form.
        tokens = [stem_words(word=token) for token in tokens]
        tokens = [token for token in tokens if token not in STOPWORDS]

        # Shorten elongated words: per-token call to pytypo.cut_repeat with
        # a limit of 3.
        return [pytypo.cut_repeat(token, 3) for token in tokens]

    def process(record):
        # Explicit unpacking replaces the Python-2-only
        # ``lambda (sentiment, tweet): ...`` form (removed by PEP 3113) and
        # still raises if a record is not a 2-item pair.
        sentiment, tweet = record
        return sentiment, clean(tweet)

    return rdd.map(process)
Пример #2
0
def test_cut_repeat():
    """Check ``pytypo.cut_repeat`` against known (text, limit, expected) cases."""
    cases = (
        ('pytypooooooo', 1, 'pytypo'),
        ('beeeeer', 2, 'beer'),
    )
    for text, limit, expected in cases:
        assert_equal(pytypo.cut_repeat(text, limit), expected)