Example #1
import re
import string

import preprocessors  # project-local helpers used for stopword filtering below

def clean(text, uni):
    # If it's a retweet, get rid of the RT text (might as well)
    if 'RT @' in text:
        text = re.sub(r'RT @\w+?:? ', '', text)
    if 'RT: ' in text:
        text = text.replace('RT: ', '')
    text = text.lower()
    # Now get rid of URLs starting with http:// (links appear shortened, so no https)
    text = re.sub(r'[ ]*http:\/\/[^ ]+', '', text)
    # Now replace some HTML entities
    replacements = {
        '&lt;': '<',
        '&gt;': '>',
        '&amp;': '&',
        ':': ''
    }
    # Get rid of numeric character entities (&#...;)
    text = re.sub(r'&#\d+;', '', text)
    for key, value in replacements.items():
        text = text.replace(key, value)
    # Now remove all the other non-ASCII chars if unicode support isn't needed
    if not uni:
        text = ''.join(c for c in text if 0 < ord(c) < 128)
    # Get rid of hashtags and mentions too
    text = re.sub(r'[@#]\w+[ ]*', '', text)

    # Get rid of punctuation 
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove stopwords 
    text = preprocessors.filter_by_list(text, preprocessors.stopwordsfile_long)

    # Strip any tabs
    text = text.replace('\t', '')
    return text.strip(' ')
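
A hedged usage sketch: the sample tweet below is made up for illustration, and the exact output depends on the stopword list in preprocessors.stopwordsfile_long.

# Illustrative input only; the result shown is approximate and depends on the stopword list.
tweet = 'RT @user: Check this out http://t.co/abc123 #cool &amp; fun'
print(clean(tweet, uni=False))  # e.g. 'check fun'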
Example #2
import preprocessors as pre  # assumed alias for the project-local preprocessors module

def preprocess(doc, max_word_length=None, min_word_length=None, stopwords="short", stem=False):
    stop = pre.stopwords if stopwords == "short" else pre.stopwords_long
    # Clean
    text = pre.clean(doc.pure_text)
    # Remove stopwords
    text = pre.filter_by_list(text, stop)
    # Remove words of certain length
    if max_word_length and min_word_length:
        text = pre.filter_by_length(text, max_length=max_word_length, min_length=min_word_length)
    else:
        text = pre.filter_by_length(text)
    # Stem
    if stem:
        text = pre.stem(text)
    # Now replace text with processed text
    doc.get_words(text)
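
A minimal sketch of how this might be called. The Document class below is a hypothetical stand-in, assuming only the pure_text attribute and get_words method that preprocess relies on; the project's real document class may differ.

# Hypothetical stand-in for the project's document class.
class Document:
    def __init__(self, pure_text):
        self.pure_text = pure_text
        self.words = None

    def get_words(self, text):
        # Tokenize on whitespace; the real implementation may differ.
        self.words = text.split()

doc = Document("Some raw document text to be cleaned and tokenized")
preprocess(doc, max_word_length=15, min_word_length=3, stem=True)
print(doc.words)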
Example #3
import preprocessors as pre  # assumed alias for the project-local preprocessors module

def clean_up(text):
    text = pre.clean(text)
    text = pre.filter_by_list(text, pre.stopwords_long)
    text = pre.remove_non_ascii(text)
    return text
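
A hedged usage sketch; the input string is illustrative and the output depends on pre.stopwords_long.

# Illustrative input only; the exact result depends on the project's stopword list.
print(clean_up("Some raw text with a few stopwords and a non-ascii char: café"))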