def clean_tweets(tweet_series):
    """Strip Reddit markup, HTML, and non-alphabetic characters from posts.

    Args:
        tweet_series: iterable of raw post/tweet strings.

    Returns:
        list[str]: one cleaned string per input, containing only ASCII
        letters with every run of other characters collapsed to a single
        space.
    """
    # Pipeline per post: drop Reddit markdown -> unescape HTML entities and
    # strip tags -> keep letters only.  A comprehension replaces the old
    # loop-and-append construction.
    return [
        re.sub("[^a-zA-Z]+", " ",
               BeautifulSoup(unescape(redditcleaner.clean(tweet)),
                             'lxml').get_text())
        for tweet in tweet_series
    ]
def posts_to_words(raw_post):
    """Tokenize one raw Reddit post into lowercase alphabetic words.

    Args:
        raw_post: raw post string (may contain Reddit markdown and HTML).

    Returns:
        list[str]: lowercase words containing only ASCII letters.
    """
    # Remove Reddit-specific markdown (links, quotes, formatting).
    raw_post = redditcleaner.clean(raw_post)
    # Explicit 'lxml' parser matches clean_tweets() and silences bs4's
    # "no parser was explicitly specified" warning.
    post_text = BeautifulSoup(unescape(raw_post), 'lxml').get_text()
    # Collapse every run of non-letters into a single space.  This already
    # removes '&', '#', '[' and ']', so the chained .replace() calls that
    # used to follow were dead code and have been dropped.
    alpha_characters_only = re.sub("[^a-zA-Z]+", " ", post_text)
    return alpha_characters_only.lower().split()
def humanReadablePost(redditRawText):
    """Clean Reddit markup and wrap the text into chunks of 15 words.

    Makes body/selftext readable, then splits the post into groups of at
    most 15 words so the output stays readable once it reaches its place
    within the JSON.

    Args:
        redditRawText: raw Reddit body/selftext string.

    Returns:
        list[str]: 1-D list; each item is a space-joined run of at most
        15 words (the final item may be shorter).
    """
    # Make reddit's text formatting readable, then tokenize on whitespace.
    words = redditcleaner.clean(redditRawText).split()
    # Stride-slice into groups of exactly <=15 words.  The previous
    # index-based loop was off by one: it split *after* appending word i
    # whenever i % 15 == 0, so its first chunk held 16 words (indices
    # 0-15) despite the stated 15-word maximum.
    return [' '.join(words[i:i + 15]) for i in range(0, len(words), 15)]
def textPostWords( redditRawText): # Makes body and selftext not an abomination. return redditcleaner.clean(redditRawText).split() '''