def clean_tweets(tweet_series):
    """Strip Reddit markup, HTML, and non-letter characters from each post.

    Parameters
    ----------
    tweet_series : iterable of str
        Raw post bodies, possibly containing Reddit markdown and
        HTML entities/tags.

    Returns
    -------
    list of str
        One cleaned string per input, containing only ASCII letters;
        every run of other characters is collapsed to a single space.
    """
    def _clean(tweet):
        # Strip Reddit-specific markdown, decode HTML entities, drop any
        # remaining tags, then collapse non-letter runs to single spaces.
        text = BeautifulSoup(unescape(redditcleaner.clean(tweet)), 'lxml').get_text()
        return re.sub("[^a-zA-Z]+", " ", text)

    # Comprehension instead of the original append loop (same output).
    return [_clean(tweet) for tweet in tweet_series]
Example #2
0
def posts_to_words(raw_post):
    """Tokenize a raw Reddit post into lowercase alphabetic words.

    Parameters
    ----------
    raw_post : str
        Raw post body, possibly containing Reddit markdown and HTML.

    Returns
    -------
    list of str
        Lowercase words containing only ASCII letters.
    """
    cleaned = redditcleaner.clean(raw_post)
    # Name the parser explicitly ('lxml', matching clean_tweets) so
    # BeautifulSoup does not emit a "no parser specified" warning and
    # parses identically regardless of which parsers are installed.
    post_text = BeautifulSoup(unescape(cleaned), 'lxml').get_text()
    # Collapse every run of non-letters into a single space.
    # NOTE: the original follow-up .replace() calls for "&", "#x200B",
    # "[", "]" were dead code -- none of those characters can survive
    # this substitution -- so they are removed here.
    alpha_characters_only = re.sub("[^a-zA-Z]+", " ", post_text)
    return alpha_characters_only.lower().split()
Example #3
0
def humanReadablePost(
        redditRawText):  #Makes body and selftext not an abomination.
    """Convert a raw Reddit body/selftext into readable 15-word lines.

    Parameters
    ----------
    redditRawText : str
        Raw post body containing Reddit markdown.

    Returns
    -------
    list of str
        The markup-stripped text, re-joined in chunks of at most 15
        words each, so it stays readable inside a JSON dump.
    """
    # Strip Reddit's text formatting, then tokenize on whitespace.
    words = redditcleaner.clean(redditRawText).split()

    # Slice into chunks of exactly 15 words (the last may be shorter).
    # BUGFIX: the original index-based loop flushed on
    # `i % 15 == 0 and i != 0`, i.e. AFTER word index 15, so the first
    # chunk held 16 words even though the stated contract is "a max of
    # 15 words" per item.
    return [' '.join(words[start:start + 15])
            for start in range(0, len(words), 15)]
Example #4
0
def textPostWords(
        redditRawText):  # Makes body and selftext not an abomination.
    """Return the whitespace-delimited words of a markup-stripped Reddit post."""
    # redditcleaner.clean removes Reddit's markdown; split() tokenizes.
    cleaned_text = redditcleaner.clean(redditRawText)
    return cleaned_text.split()
    '''