def clean(text, uni=False):
    """Clean a raw tweet string for downstream processing.

    Strips retweet markers, URLs, HTML entities, hashtags/mentions,
    punctuation, stopwords and tabs, and lower-cases everything.

    Args:
        text: raw tweet text.
        uni: keep non-ASCII characters when True. Defaults to False so the
            existing one-argument callers (`pre.clean(text)`) keep working.

    Returns:
        The cleaned text with leading/trailing spaces stripped.
    """
    # If it's a retweet, get rid of the RT text (might as well)
    if 'RT @' in text:
        text = re.sub(r'RT @\w+?:? ', '', text)
    if 'RT: ' in text:
        text = text.replace('RT: ', '')
    text = text.lower()
    # Now get rid of the URLs starting with http:// (links appear shortened so, no https)
    text = re.sub(r'[ ]*http:\/\/[^ ]+', '', text)
    # Now decode the common HTML entities. (The previous mapping had been
    # garbled into no-ops like '<' -> '<'; restored to the actual entities.)
    replacements = {
        '&lt;': '<',
        '&gt;': '>',
        '&amp;': '&',
        ':': '',  # NOTE(review): drops every colon — punctuation removal below
                  # would do this anyway; confirm it was intended.
    }
    # Get rid of other ampersand-containing things (numeric refs like &#8217;)
    text = re.sub(r'&#\d+;', '', text)
    # .items() instead of py2-only .iteritems(); identical behavior on py2.
    for key, value in replacements.items():
        text = text.replace(key, value)
    # Now remove all the other non-ascii chars if don't need unicode support
    if not uni:
        text = ''.join(c for c in text if 0 < ord(c) < 128)
    # Get rid of hashtags and mentions too
    text = re.sub(r'[@#]\w+[ ]*', '', text)
    # Get rid of punctuation. re.sub is py2/py3-portable, unlike
    # str.translate(None, ...) which only exists on py2 str; same effect:
    # deletes every char in string.punctuation.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Remove stopwords
    text = preprocessors.filter_by_list(text, preprocessors.stopwordsfile_long)
    # Tabs?
    text = text.replace('\t', '')
    return text.strip(' ')
def preprocess(doc, max_word_length=None, min_word_length=None, stopwords="short", stem=False):
    """Run the standard cleaning pipeline over *doc* and store the result.

    Args:
        doc: document object exposing ``pure_text`` and ``get_words``.
        max_word_length: upper word-length bound; only honoured when
            ``min_word_length`` is also given (both-or-neither).
        min_word_length: lower word-length bound; see above.
        stopwords: ``"short"`` selects ``pre.stopwords``; any other value
            selects ``pre.stopwords_long``.
        stem: apply stemming when True.
    """
    # Pick the stopword list up front.
    stopword_list = pre.stopwords if stopwords == "short" else pre.stopwords_long

    # Clean the raw text, then drop stopwords.
    text = pre.clean(doc.pure_text)
    text = pre.filter_by_list(text, stopword_list)

    # Length filtering: custom bounds apply only when BOTH were supplied,
    # otherwise the helper's defaults are used.
    length_kwargs = {}
    if max_word_length and min_word_length:
        length_kwargs = dict(max_length=max_word_length, min_length=min_word_length)
    text = pre.filter_by_length(text, **length_kwargs)

    # Optional stemming stage.
    if stem:
        text = pre.stem(text)

    # Hand the processed text back to the document.
    doc.get_words(text)
def clean_up(text):
    """Clean *text*: base cleaning, long-stopword removal, then ASCII-only filter."""
    # Run the three preprocessing stages in order.
    stages = (
        pre.clean,
        lambda t: pre.filter_by_list(t, pre.stopwords_long),
        pre.remove_non_ascii,
    )
    for stage in stages:
        text = stage(text)
    return text