Example no. 1
# Imports assumed by this example; `transliterate` must be supplied by the
# surrounding module (it is not defined here).
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize


def text_cleaner(text,
                 deep_clean=False,
                 stem=True,
                 stop_words=True,
                 translite_rate=True):
    rules = [
        {
            r'>\s+': u'>'
        },  # remove spaces after a tag opens or closes
        {
            r'\s+': u' '
        },  # replace consecutive spaces
        {
            r'\s*<br\s*/?>\s*': u'\n'
        },  # newline after a <br>
        {
            r'</(div)\s*>\s*': u'\n'
        },  # newline after </div>
        {
            r'</(p|h\d)\s*>\s*': u'\n\n'
        },  # blank line after </p> and heading tags such as </h1>
        {
            r'<head>.*<\s*(/head|body)[^>]*>': u''
        },  # remove <head> to </head>
        {
            r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'
        },  # keep the link URL in place of the anchor text
        {
            r'[ \t]*<[^<]*?/?>': u''
        },  # remove remaining tags
        {
            r'^\s+': u''
        }  # remove spaces at the beginning
    ]
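    # note (added): the rules are applied in order, so e.g. the
    # <head>...</head> stripper runs before the generic tag remover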

    if deep_clean:
        # drop '.' and '"' outright, and turn the remaining punctuation
        # characters into spaces
        for ch in '."':
            text = text.replace(ch, "")
        for ch in '[],()-=?!':
            text = text.replace(ch, " ")

        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
            text = text.strip()
        text = text.replace('+', ' ').replace('.', ' ')
        text = text.replace(',', ' ').replace(':', ' ')
        text = re.sub("(^|\W)\d+($|\W)", " ", text)
        if translite_rate:
            text = transliterate(text)
        if stem:
            # note: this stems (and below, lemmatizes) the text as a single
            # token; per-word stemming would require tokenizing first
            text = PorterStemmer().stem(text)
        text = WordNetLemmatizer().lemmatize(text)
        if stop_words:
            # use a fresh name so the boolean `stop_words` argument is not
            # shadowed
            stop_set = set(stopwords.words('english'))
            word_tokens = word_tokenize(text)
            text = ' '.join(w for w in word_tokens if w not in stop_set)
    else:
        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
            text = text.strip()
    return text.lower()
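
A quick usage sketch (the sample string is illustrative; with deep_clean=True
the NLTK "stopwords", "wordnet", and "punkt" resources must be downloaded
first, and a transliterate() function must be in scope):

sample = '<div>Hello, <a href="https://example.com">click here</a>!</div>'
print(text_cleaner(sample, deep_clean=False))
# -> hello, https://example.com!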
Example no. 2
# Imports assumed by this example; `getVocabList` must be supplied by the
# surrounding module and return a dict mapping words to 1-based indices.
import re

from nltk.stem import PorterStemmer


def processEmail(email_contents):
    vocabList = getVocabList()
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers

    # hdrstart = email_contents.find("\n\n")
    # if hdrstart != -1:
    #     email_contents = email_contents[hdrstart:]
    
    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Look for any expression that starts with <, ends with >, contains no
    # < or > inside, and replace it with a space
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)
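    # e.g. (illustrative): '<p>Hi</p>' becomes ' Hi '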

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)
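    # e.g. (illustrative): 'call 555-1234' becomes 'call number-number'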

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)
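    # e.g. (illustrative): 'see http://x.io/a?b' becomes 'see httpaddr'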

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)
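    # e.g. (illustrative): 'mail me at a.b@x.io' becomes 'mail me at emailaddr'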

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)
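    # e.g. (illustrative): 'win $$$' becomes 'win dollar'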

    # Tokenize and get rid of any punctuation; the delimiter set mirrors the
    # original Octave strtok call, with '-' escaped so that '.-:' is not read
    # as a character range
    email_contents = re.split(r'[@$/#.\-:&*+=\[\]?!(){},\'">_<;%\s]+',
                              email_contents)

    # Output the email to screen as well
    #print('\n==== Processed Email ====\n\n')
    # Process file
    l = 0
    stemmer = PorterStemmer()  # create the stemmer once, outside the loop
    for token in email_contents:
        # Remove any non alphanumeric characters
        token = re.sub('[^a-zA-Z0-9]', '', token)
        # Stem the word
        token = stemmer.stem(token.strip())
        # Skip the word if it is too short
        if len(token) < 1:
            continue
        # Look up the word; idx stays 0 if the token is not in vocabList
        idx = vocabList[token] if token in vocabList else 0
        # Only add entries which are in vocabList, i.e. those with idx != 0
        if idx > 0:
            word_indices.append(idx)
        # Print to screen, ensuring that the output lines are not too long
        if l + len(token) + 1 > 78:
            print("")
            l = 0
        print(token, end=' ')
        l = l + len(token) + 1

    # Print footer
    #print('\n\n=========================\n')
    
    return word_indices
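
A quick usage sketch (the email text is illustrative, and the returned
indices depend entirely on the vocabulary that getVocabList() supplies):

sample_email = "Visit http://deals.example.com now, only $99!"
indices = processEmail(sample_email)
print(indices)  # 1-based indices of the recognized, stemmed tokens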