def processEmail(email_contents):
    """Preprocess an email body and return the vocabulary indices of its words.

    The text is lower-cased; HTML tags, numbers, URLs, e-mail addresses and
    dollar signs are replaced by placeholder tokens; the remaining alphabetic
    words are stemmed with the Porter stemmer and looked up in the vocabulary.

    Returns a list of 0-based indices into getVocabList() for every stemmed
    word found in the vocabulary.
    """
    vocabList = getVocabList()
    word_indices = []

    # Normalize the email content.
    email_contents = email_contents.lower()
    email_contents, _ = re.subn(r'<[^<>]+>', ' ', email_contents)      # strip HTML tags
    email_contents, _ = re.subn(r'[0-9]+', 'number', email_contents)   # digits -> placeholder
    email_contents, _ = re.subn(r'(http|https)://[^\s]*', 'httpaddr', email_contents)
    email_contents, _ = re.subn(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)
    email_contents, _ = re.subn(r'[$]+', 'dollar', email_contents)

    # Strip word suffixes (plurals, -ing, ...) with the nltk Porter stemmer.
    from nltk.stem import PorterStemmer

    if email_contents != '':
        # Map each vocabulary word to its first index once, so lookup below is
        # O(1) per word instead of a full linear scan of the vocabulary per word.
        # NOTE(review): assumes vocabulary entries are unique (the original
        # appended every matching index) -- confirm against getVocabList.
        vocab_index = {}
        for i, vocab_word in enumerate(vocabList):
            vocab_index.setdefault(vocab_word, i)

        stemmer = PorterStemmer()  # hoisted: one stemmer instance for all words
        for word in re.findall(r'[A-Za-z]+', email_contents):
            word = stemmer.stem(word)
            if word in vocab_index:
                word_indices.append(vocab_index[word])

    return word_indices
def processEmail(email_contents):
    """Work-in-progress preprocessing of an email body.

    Normalizes the text (lower case, number/URL/e-mail/dollar placeholders)
    and prints intermediate tokenization results for debugging.

    NOTE(review): this variant never builds or returns word indices -- it
    implicitly returns None after printing the processed text.
    """
    vocab_list = getVocabList()
    print(len(vocab_list))

    # process input
    email_contents = email_contents.lower()
    # email_contents = email_contents.replace("<[^<>]+>', ' ", "")
    # email_contents = re.sub("[+>', ']", "", email_contents)

    # handle numbers
    email_contents = re.sub("[0-9]+", "number", email_contents)

    # handle URL
    email_contents = re.sub("(http|https)://[^\s]*", "httpaddr", email_contents)
    email_contents = re.sub("[^\s]+@[^\s]+", "emailaddr", email_contents)
    email_contents = re.sub("[$]+", "dollar", email_contents)

    # token email
    # remove punctuations
    # @$ / # .-:&*+=[]?!(){},''
    # NOTE(review): the pattern below is two adjacent string literals (the
    # second one empty) forming a LITERAL regex, not a character class --
    # it only splits on that exact punctuation sequence.  Probably intended
    # to be a bracketed class like [@$/#.:&*+=...] -- confirm with author.
    l = re.compile(r'@$ / # .-:&*\+=\[]?!(){},' '').split(email_contents)
    print(l)
    print(len(l))
    # NOTE(review): same literal-pattern issue here; the leading space makes
    # it match even more rarely.
    email_contents = re.sub(r' @$ / # .-:&*\+=\[]?!(){},' '', " ", email_contents)
    print(email_contents)
def ProcessEmail(email):
    """Normalize an email, stem its tokens and map them to vocabulary indices.

    Returns a tuple (word_index, vocab_dict): word_index is the list of
    integer indices of recognized (stemmed) tokens, vocab_dict is the
    vocabulary mapping loaded via getVocabList().
    """
    # Each (pattern, replacement) pair normalizes one kind of content, applied
    # in order: URLs, e-mail addresses, punctuation, digits, dollar signs,
    # runs of whitespace.
    substitutions = (
        ('(http|https)://[^\s]*', 'httpaddr'),
        ('[^\s]+@[^\s]+', 'emailaddr'),
        ('[<>?,.:/]+', ' '),
        ('[0-9]+', 'number'),
        ('[$]+', 'dollar '),
        ('[\s]+', ' '),
    )
    email = email.lower()
    for pattern, replacement in substitutions:
        email = re.sub(pattern, replacement, email)

    print("Processed e-mail :\n\n", email)

    ######################
    stemmer = nltk.PorterStemmer()
    vocab_dict = getVocabList()

    # Stemmed tokens present in the vocabulary contribute their integer index;
    # int() guards against indices stored as strings in the vocabulary file.
    word_index = [int(vocab_dict[stemmed])
                  for stemmed in (stemmer.stem(tok) for tok in email.split())
                  if stemmed in vocab_dict]

    return word_index, vocab_dict
# Evaluate the trained linear SVM on the held-out test set.
p = svmPredict(model, Xtest)
print('Test Accuracy: %f\n'%(np.mean(np.double(p == ytest)) * 100))

input('Program paused. Press enter to continue.\n')

## ================= Part 5: Top Predictors of Spam ====================
# Since the model we are training is a linear SVM, we can inspect the
# weights learned by the model to understand better how it is determining
# whether an email is spam or not. The following code finds the words with
# the highest weights in the classifier. Informally, the classifier
# 'thinks' that these words are the most likely indicators of spam.
#

# Sort the weights and obtain the vocabulary list.
# argsort of the NEGATED weights yields indices in descending weight order.
idx = np.argsort(-model['w'], axis=0)
vocabList = getVocabList()

# Print the 15 words with the largest weights (strongest spam indicators).
print('\nTop predictors of spam: \n')
for i in range(15):
    print(' %-15s (%f) \n'%(vocabList[idx[i][0]], model['w'][idx[i][0]]))

print('\n\n')
print('\nProgram paused. Press enter to continue.\n')
input('Program paused. Press enter to continue.\n')

## =================== Part 6: Try Your Own Emails =====================
# Now that you've trained the spam classifier, you can use it on your own
# emails! In the starter code, we have included spamSample1.txt,
# spamSample2.txt, emailSample1.txt and emailSample2.txt as examples.
# The following code reads in one of these emails and then uses your
# learned SVM classifier to determine whether the email is Spam or
def processEmail(email_contents):
    """Preprocess an email body and return the vocabulary indices of its words.

    The text is lower-cased; HTML, numbers, URLs, e-mail addresses and dollar
    signs are replaced by placeholder tokens; tokens are split on punctuation,
    stemmed, echoed to the screen (wrapped at 78 columns) and looked up in the
    vocabulary.  Returns a list of 0-based indices into getVocabList().
    """
    # Load Vocabulary
    vocabList = getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML: anything that starts with < and ends with > and does not
    # contain < or > inside the tag is replaced with a space.
    email_contents = re.sub(r"<[^<>]+>", " ", email_contents)

    # Handle Numbers: one or more characters between 0-9
    email_contents = re.sub(r"[0-9]+", "number", email_contents)

    # Handle URLs: strings starting with http:// or https://
    email_contents = re.sub(r"(http|https)://[^\s]*", "httpaddr", email_contents)

    # Handle Email Addresses: strings with @ in the middle
    email_contents = re.sub(r"[^\s]+@[^\s]+", "emailaddr", email_contents)

    # Handle $ sign
    email_contents = re.sub(r"[$]+", "dollar", email_contents)

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n\n')

    # Map each vocabulary word to its first index once: O(1) lookup per token
    # instead of a linear scan of the whole vocabulary for every token.
    vocab_index = {}
    for i, vocab_word in enumerate(vocabList):
        vocab_index.setdefault(vocab_word, i)

    # Process file
    l = 0
    strs = re.split(r'[ `\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?\n\t]', email_contents)
    p = PorterStemmer()
    for _str in strs:
        # Remove any non alphanumeric characters
        _str = re.sub(r"[^a-zA-Z0-9]", "", _str)

        # Stem the word
        _str = p.stem(_str)

        # Skip the word if it is too short
        if len(_str) < 1:
            continue

        idx = vocab_index.get(_str)
        if idx is not None:
            word_indices.append(idx)

        # Wrap the echoed output at 78 columns.
        # BUG FIX: the original never incremented l after printing, so the
        # wrap condition could never fire and everything stayed on one line.
        if l + len(_str) + 1 > 78:
            print('')
            l = 0
        print(_str + " ", end='')
        l = l + len(_str) + 1

    # Print footer
    print('\n\n=========================\n')
    return word_indices
## ================= Part 5: Top Predictors of Spam ==================== # Since the model we are training is a linear SVM, we can inspect the # weights learned by the model to understand better how it is determining # whether an email is spam or not. The following code finds the words with # the highest weights in the classifier. Informally, the classifier # 'thinks' that these words are the most likely indicators of spam. # # Sort the weights and obtain the vocabulary list w = model.coef_[0] # from http://stackoverflow.com/a/16486305/583834 # reverse sorting by index indices = w.argsort()[::-1][:15] vocabList = sorted(gvl.getVocabList().keys()) print('\nTop predictors of spam: \n'); for idx in indices: print( ' {:s} ({:f}) '.format( vocabList[idx], float(w[idx]) ) ) raw_input('Program paused. Press enter to continue.') ## =================== Part 6: Try Your Own Emails ===================== # Now that you've trained the spam classifier, you can use it on your own # emails! In the starter code, we have included spamSample1.txt, # spamSample2.txt, emailSample1.txt and emailSample2.txt as examples. # The following code reads in one of these emails and then uses your # learned SVM classifier to determine whether the email is Spam or # Not Spam
## ================= Part 5: Top Predictors of Spam ====================
#  A linear SVM exposes its learned weight vector directly, so the words
#  carrying the largest weights are the ones the classifier informally
#  'thinks' are the strongest indicators of spam.  List the top 15.
#

# Sort the weights and obtain the vocabulary list
w = model.coef_[0]

# from http://stackoverflow.com/a/16486305/583834
# argsort ascending, reversed, truncated: 15 largest weights in descending order
indices = w.argsort()[::-1][:15]
vocabList = sorted(getVocabList().keys())

print('\nTop predictors of spam: \n')
# Pair each top index with its word and weight, then report them.
for word, weight in ((vocabList[i], float(w[i])) for i in indices):
    print(' {:s} ({:f}) '.format(word, weight))

input('Program paused. Press <Enter> to continue...')

## =================== Part 6: Try Your Own Emails =====================
#  Now that you've trained the spam classifier, you can use it on your own
#  emails! In the starter code, we have included spamSample1.txt,
#  spamSample2.txt, emailSample1.txt and emailSample2.txt as examples.
#  The following code reads in one of these emails and then uses your
#  learned SVM classifier to determine whether the email is Spam or
#  Not Spam
def processEmail(email_contents):
    """Preprocess an email body and return the vocabulary indices of its words.

    Lower-cases the text, replaces HTML/numbers/URLs/e-mail addresses/dollar
    signs with placeholder tokens, splits on non-alphanumerics, stems each
    word, echoes the result to screen (wrapped at 78 columns) and collects the
    0-based index of every word found in the vocabulary list.
    """
    # Load Vocabulary
    vocabList = getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML: any expression that starts with < and ends with > and
    # has no < or > inside is replaced with a space (newlines too).
    rx = re.compile('<[^<>]+>|\n')
    email_contents = rx.sub(' ', email_contents)

    # Handle Numbers: one or more characters between 0-9
    rx = re.compile('[0-9]+')
    email_contents = rx.sub('number', email_contents)

    # Handle URLs: strings starting with http:// or https://
    rx = re.compile('(http|https)://[^\s]*')
    email_contents = rx.sub('httpaddr', email_contents)

    # Handle Email Addresses: strings with @ in the middle
    rx = re.compile('[^\s]+@[^\s]+')
    email_contents = rx.sub('emailaddr', email_contents)

    # Handle $ sign
    rx = re.compile('[$]+')
    email_contents = rx.sub('dollar', email_contents)

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n=== Processed Email ====')

    # Process file
    l = 0

    # Remove any non alphanumeric characters and split into words
    rx = re.compile('[^a-zA-Z0-9 ]')
    email_contents = rx.sub('', email_contents).split()
    print(email_contents)
    for word in email_contents:
        # Stem the word (the stemmer sometimes has issues, so keep the
        # best-effort try/except -- narrowed from a bare except clause).
        try:
            word = porterStemmer(word.strip())
        except Exception:
            word = ''
            continue

        # Skip the word if it is too short
        if len(word) < 1:
            continue

        # Look up the word in the dictionary and add to word_indices if found
        if word in vocabList:
            word_indices.append(vocabList.index(word))

        # Print to screen, ensuring that the output lines are not too long.
        # BUG FIX: the original condition was `l + len(word) + l > 78`
        # (second l instead of 1), which made the wrap threshold drift with
        # the running count; +1 accounts for the trailing space.
        if l + len(word) + 1 > 78:
            print(word)
            l = 0
        else:
            print(word, end=' ')
            l = l + len(word) + 1

    print('\n=========================\n')
    return word_indices
def processEmail(email_contents): """preprocesses a the body of an email and returns a list of word_indices word_indices = PROCESSEMAIL(email_contents) preprocesses the body of an email and returns a list of indices of the words contained in the email. """ # Load Vocabulary vocabList = getVocabList() # Init return value word_indices = [] # ========================== Preprocess Email =========================== # Find the Headers ( \n\n and remove ) # Uncomment the following lines if you are working with raw emails with the # full headers # hdrstart = strfind(email_contents, ([chr(10) chr(10)])) # email_contents = email_contents(hdrstart(1):end) # Lower case email_contents = lower(email_contents) # Strip all HTML # Looks for any expression that starts with < and ends with > and replace # and does not have any < or > in the tag it with a space rx = re.compile('<[^<>]+>|\n') email_contents = rx.sub(' ', email_contents) # Handle Numbers # Look for one or more characters between 0-9 rx = re.compile('[0-9]+') email_contents = rx.sub('number ', email_contents) # Handle URLS # Look for strings starting with http:// or https:// rx = re.compile('(http|https)://[^\s]*') email_contents = rx.sub('httpaddr ', email_contents) # Handle Email Addresses # Look for strings with @ in the middle rx = re.compile('[^\s]+@[^\s]+') email_contents = rx.sub('emailaddr ', email_contents) # Handle $ sign rx = re.compile('[$]+') email_contents = rx.sub('dollar ', email_contents) # ========================== Tokenize Email =========================== # Output the email to screen as well print '==== Processed Email ====\n' # Process file l = 0 # Remove any non alphanumeric characters rx = re.compile('[^a-zA-Z0-9 ]') email_contents = rx.sub('', email_contents).split() for str in email_contents: # Tokenize and also get rid of any punctuation # str = re.split('[' + re.escape(' @$/#.-:&*+=[]?!(){},''">_<#') # + chr(10) + chr(13) + ']', str) # Stem the word # (the porterStemmer sometimes has issues, 
so we use a try catch block) try: str = porterStemmer(str.strip()) except: str = '' continue # Skip the word if it is too short if len(str) < 1: continue # Look up the word in the dictionary and add to word_indices if # found # ====================== YOUR CODE HERE ====================== # Instructions: Fill in this function to add the index of str to # word_indices if it is in the vocabulary. At this point # of the code, you have a stemmed word from the email in # the variable str. You should look up str in the # vocabulary list (vocabList). If a match exists, you # should add the index of the word to the word_indices # vector. Concretely, if str = 'action', then you should # look up the vocabulary list to find where in vocabList # 'action' appears. For example, if vocabList{18} = # 'action', then, you should add 18 to the word_indices # vector (e.g., word_indices = [word_indices 18] ). # # Note: vocabList{idx} returns a the word with index idx in the # vocabulary list. # # Note: You can use strcmp(str1, str2) to compare two strings (str1 and # str2). It will return 1 only if the two strings are equivalent. # # ============================================================= # Print to screen, ensuring that the output lines are not too long if (l + len(str) + 1) > 78: print str l = 0 else: print str, l = l + len(str) + 1 # Print footer print '\n=========================' return word_indices
def ex6_spam():
    """Driver for ML Exercise 6: spam classification with a linear SVM.

    Pipeline: preprocess a sample email, extract its feature vector, train a
    linear SVM on spamTrain.mat, evaluate it on spamTest.mat, list the 15
    highest-weight vocabulary words, then classify a sample email.  Relies on
    helpers defined elsewhere in the project: readFile, processEmail,
    emailFeatures, formatter, svmTrain, svmPredict, linearKernel,
    getVocabList.
    """
    ## Machine Learning Online Class
    #  Exercise 6 | Spam Classification with SVMs
    #
    #  Instructions
    #  ------------
    #
    #  This file contains code that helps you get started on the
    #  exercise. You will need to complete the following functions:
    #
    #     gaussianKernel.m
    #     dataset3Params.m
    #     processEmail.m
    #     emailFeatures.m
    #
    #  For this exercise, you will not need to change any code in this file,
    #  or any other files other than those mentioned above.
    #

    ## Initialization
    #clear ; close all; clc

    ## ==================== Part 1: Email Preprocessing ====================
    #  To use an SVM to classify emails into Spam v.s. Non-Spam, you first need
    #  to convert each email into a vector of features. In this part, you will
    #  implement the preprocessing steps for each email. You should
    #  complete the code in processEmail.m to produce a word indices vector
    #  for a given email.

    print('\nPreprocessing sample email (emailSample1.txt)')

    # Extract Features
    file_contents = readFile('emailSample1.txt')
    word_indices = processEmail(file_contents)

    # Print Stats (+1: displayed indices are 1-based, matching the Octave
    # version of the exercise)
    print('Word Indices: ')
    print(formatter(' %d', np.array(word_indices) + 1))
    print('\n')
    print('Program paused. Press enter to continue.')
    #pause;

    ## ==================== Part 2: Feature Extraction ====================
    #  Now, you will convert each email into a vector of features in R^n.
    #  You should complete the code in emailFeatures.m to produce a feature
    #  vector for a given email.

    print('\nExtracting features from sample email (emailSample1.txt)')

    # Extract Features
    file_contents = readFile('emailSample1.txt')
    word_indices = processEmail(file_contents)
    features = emailFeatures(word_indices)

    # Print Stats
    print('Length of feature vector: %d' % features.size)
    print('Number of non-zero entries: %d' % np.sum(features > 0))
    print('Program paused. Press enter to continue.')
    #pause;

    ## =========== Part 3: Train Linear SVM for Spam Classification ========
    #  In this section, you will train a linear classifier to determine if an
    #  email is Spam or Not-Spam.

    # Load the Spam Email dataset
    # You will have X, y in your environment
    mat = scipy.io.loadmat('spamTrain.mat')
    X = mat['X'].astype(float)
    y = mat['y'][:, 0]

    print('\nTraining Linear SVM (Spam Classification)\n')
    print('(this may take 1 to 2 minutes) ...\n')

    C = 0.1
    model = svmTrain(X, y, C, linearKernel)

    p = svmPredict(model, X)
    print('Training Accuracy: %f' % (np.mean(p == y) * 100))

    ## =================== Part 4: Test Spam Classification ================
    #  After training the classifier, we can evaluate it on a test set. We have
    #  included a test set in spamTest.mat

    # Load the test dataset
    # You will have Xtest, ytest in your environment
    mat = scipy.io.loadmat('spamTest.mat')
    Xtest = mat['Xtest'].astype(float)
    ytest = mat['ytest'][:, 0]

    print('\nEvaluating the trained Linear SVM on a test set ...\n')

    p = svmPredict(model, Xtest)
    print('Test Accuracy: %f\n' % (np.mean(p == ytest) * 100))
    #pause;

    ## ================= Part 5: Top Predictors of Spam ====================
    #  Since the model we are training is a linear SVM, we can inspect the
    #  weights learned by the model to understand better how it is determining
    #  whether an email is spam or not. The following code finds the words with
    #  the highest weights in the classifier. Informally, the classifier
    #  'thinks' that these words are the most likely indicators of spam.
    #

    # Sort the weights and obtain the vocabulary list
    idx = np.argsort(model['w'])
    top_idx = idx[-15:][::-1]   # indices of the 15 largest weights, descending
    vocabList = getVocabList()

    print('\nTop predictors of spam: ')
    for word, w in zip(np.array(vocabList)[top_idx], model['w'][top_idx]):
        print(' %-15s (%f)' % (word, w))
    #end

    print('\n')
    print('\nProgram paused. Press enter to continue.')
    #pause;

    ## =================== Part 6: Try Your Own Emails =====================
    #  Now that you've trained the spam classifier, you can use it on your own
    #  emails! In the starter code, we have included spamSample1.txt,
    #  spamSample2.txt, emailSample1.txt and emailSample2.txt as examples.
    #  The following code reads in one of these emails and then uses your
    #  learned SVM classifier to determine whether the email is Spam or
    #  Not Spam

    # Set the file to be read in (change this to spamSample2.txt,
    # emailSample1.txt or emailSample2.txt to see different predictions on
    # different emails types). Try your own emails as well!
    filename = 'spamSample1.txt'

    # Read and predict
    file_contents = readFile(filename)
    word_indices = processEmail(file_contents)
    x = emailFeatures(word_indices)
    p = svmPredict(model, x.ravel())

    print('\nProcessed %s\n\nSpam Classification: %d' % (filename, p))
    print('(1 indicates spam, 0 indicates not spam)\n')
# Training accuracy of the fitted linear SVM.
p = model.predict(X)
print('Training Accuracy: ', np.mean(np.double(p == y.ravel())) * 100)

# =================== Part 4: Test Spam Classification ================
data = loadmat("data/spamTest.mat")
Xtest = data['Xtest']
ytest = data['ytest']
p = model.predict(Xtest)
print('Test Accuracy: ', np.mean(np.double(p == ytest.ravel())) * 100)

# ================= Part 5: Top Predictors of Spam ====================
# The 15 largest weights of the linear SVM identify the vocabulary words the
# classifier treats as the strongest indicators of spam.
w = model.coef_[0]
idx = np.argsort(w)[::-1][:15]
vocabList = list(getVocabList().keys())
print('Top predictors of spam: ')
for i in idx:
    print("{:15s} {:.3f}".format(vocabList[i], w[i]))

# =================== Part 6: Try Your Own Emails =====================
# FIX: renamed the handle from `file` (which shadowed the builtin name) and
# dropped the redundant explicit close() -- the `with` block already closes
# the file on exit.
with open('data/emailSample1.txt', 'r') as fh:
    file_contents = fh.read()
word_indices = processEmail(file_contents)
x = emailFeatures(word_indices)
p = model.predict(x.T)
print('Spam Classification: ', p)
print('(1 indicates spam, 0 indicates not spam)')
def processEmail(email_contents):
    """Preprocess the body of an email and return a list of word indices.

    word_indices = PROCESSEMAIL(email_contents) preprocesses the body of an
    email and returns a list of indices of the words contained in the email.
    The vocabulary is a dict mapping word -> index; entries not found map to
    0 and are skipped (the vocabulary indices are assumed to start at 1 --
    TODO confirm against getVocabList).
    """
    # Load Vocabulary
    vocabList = gvl.getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers
    # hdrstart = email_contents.find("\n\n")
    # if hdrstart:
    #     email_contents = email_contents[hdrstart:]

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML: any expression that starts with < and ends with > and
    # has no < or > inside is replaced with a space.
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers: one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLs: strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle Email Addresses: strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n\n')

    # Process file
    l = 0

    # Split and also get rid of any punctuation
    email_contents = re.split(r'[@$/#.-:&\*\+=\[\]?!(){},\'\'\">_<;%\s\n\r\t]+', email_contents)

    # Hoisted: one stemmer instance for all tokens instead of constructing a
    # new PorterStemmer per token.
    stemmer = PorterStemmer()
    for token in email_contents:
        # Remove any non alphanumeric characters
        token = re.sub('[^a-zA-Z0-9]', '', token)

        # Stem the word.
        # FIX: PorterStemmer.stem_word() was removed from NLTK; stem() is the
        # supported API for stemming a single word.
        token = stemmer.stem(token.strip())

        # Skip the word if it is too short
        if len(token) < 1:
            continue

        # Single dict lookup instead of a membership test followed by a
        # second lookup; missing words map to 0 and are skipped.
        idx = vocabList.get(token, 0)
        if idx > 0:
            word_indices.append(idx)

        # Print to screen, ensuring that the output lines are not too long.
        if l + len(token) + 1 > 78:
            print("")
            l = 0
        # FIX: the original `print(...),` was a Python-2 leftover -- the
        # stray trailing comma built a throwaway tuple and each token was
        # printed on its own line, defeating the wrapping logic above.
        print('{:s}'.format(token), end=' ')
        l = l + len(token) + 1

    # Print footer
    print('\n\n=========================\n')
    return word_indices
def processEmail(email_contents=""):
    """Preprocess an email body and return the vocabulary indices of its words.

    Normalizes the text (lower case; HTML, numbers, URLs, e-mail addresses and
    dollar signs replaced by placeholder tokens), splits it on a fixed set of
    delimiters, stems each token and returns the list of 0-based indices of
    the stemmed tokens found in the vocabulary list.
    """
    vocabList = getVocabList.getVocabList()
    word_indices = []

    # ========================== Preprocess Email ===========================
    email_contents = str(email_contents)

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML tags (anything between < and > without nested brackets).
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers: one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLs: strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle Email Addresses: strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # ========================== Tokenize Email ===========================
    # NOTE(review): the ' $' entry contains a leading space and looks like a
    # typo for '$'; kept as-is to preserve behavior -- confirm with author.
    delimiters = (' ', '@', ' $', '|', '/', '#', '.', '-', ':', '&', '*', '+',
                  '=', '[', ']', '?', '!', '(', ')', '{', '}', ',', "'", '"',
                  '>', '_', '<', '|', ';', '%', "\n", "\t")
    regexPattern = '|'.join(map(re.escape, delimiters))

    # Map each vocabulary word to its first index once, replacing the original
    # O(len(vocabList)) scan per token with an O(1) dict lookup.
    # NOTE(review): assumes vocabulary entries are unique -- the original
    # appended every matching index.
    first_index = {}
    for j, vocab_word in enumerate(vocabList):
        first_index.setdefault(vocab_word, j)

    for token in re.split(regexPattern, email_contents):
        if len(token) == 0:
            continue
        # Strip any remaining non-alphanumerics, then stem.
        token = stem(re.sub('[^a-zA-Z0-9]', '', token))
        if token in first_index:
            word_indices.append(first_index[token])

    return word_indices
def main() :
    """Driver for the spam-classification exercise using scikit-learn.

    Preprocesses a sample email, extracts its features, trains a linear SVC
    on spamTrain.mat, evaluates it on spamTest.mat, prints the top 15 spam
    predictor words, and classifies spamSample1.txt.  Relies on project
    helpers: processEmail, emailFeatures, getVocabList.
    """
    # Data files live in ./dataSets relative to the working directory.
    path = os.getcwd()
    path = os.path.join(path,'dataSets')

    # =============== Part 1 ====================
    # To use an SVM to classify emails into Spam v.s. Non-Spam, we first need
    # to convert each email into a vector of features. In this part, we
    # implement the preprocessing steps for each email.
    f = open(os.path.join(path,"emailSample1.txt"),'r')
    email_contents = f.read()
    f.close()
    print(email_contents)

    word_indices = processEmail.processEmail(email_contents)
    features = emailFeatures(word_indices)
    print('Word Indices :\n')
    print(word_indices, "\n")

    #============= Part 2 =======================
    # Print Stats (feature vector is binary, so its sum counts the non-zeros)
    print('Length of feature vector: %d\n' % len(features))
    print('Number of non-zero entries: %d\n' % sum(features))

    # ============= Part 3 ======================
    # In this section, we will train a linear classifier to determine if an
    # email is Spam or Not-Spam.
    print('\n\nRunning SVM on training set...')
    mat = io.loadmat(os.path.join(path,'spamTrain.mat'))
    X = mat['X']
    y = mat['y']
    y = numpy.ravel(y)   # flatten the (m, 1) label matrix to shape (m,)

    model = svm.SVC(C = 0.1, kernel='linear')
    model.fit(X, y)
    p = model.predict(X)
    accuracy = model.score(X, y)
    accuracy *= 100.0
    print('\nTraining Accuracy: %.2f' % accuracy)

    #================ Part 4 ========================
    # Xtest and ytest are the env. variables
    mat = io.loadmat(os.path.join(path,'spamTest.mat'))
    XTest = mat['Xtest']
    yTest = mat['ytest']
    yTest = numpy.ravel(yTest)

    p = model.predict(XTest)
    accuracy = model.score(XTest,yTest)
    accuracy *= 100.0
    print('\nTest Accuracy: %.2f' % accuracy)

    #================ Part 5 ============================
    # Since the model we are training is a linear SVM, we can inspect the
    # weights learned by the model to understand better how it is determining
    # whether an email is spam or not. The following code finds the words with
    # the highest weights in the classifier. Informally, the classifier
    # 'thinks' that these words are the most likely indicators of spam.
    print('\nTop spam predictors (keywords) \n')
    z = model.coef_
    z = numpy.ravel(z)
    vocabList = getVocabList.getVocabList()

    # Map each vocabulary word to its learned weight, then print the 15
    # words with the largest weights.
    dic = {}
    for i in range(len(z)) :
        dic[ vocabList[i] ] = z[i]
    cnt = 0
    for w in sorted(dic, key=dic.get, reverse=True):
        if cnt == 15 :
            break
        cnt = cnt + 1
        print('{0:10} - {1:10f}'.format(w, dic[w]))
    print('\n\n')

    # ============ Part 6: Test a sample Email =====================
    # Now that we have trained the spam classifier, we can use it on our own
    # emails!
    # The following code reads in one of these emails and then uses our
    # learned SVM classifier to determine whether the email is Spam or
    # Not Spam
    f = open(os.path.join(path, "spamSample1.txt"),'r')
    email_contents = f.read()
    f.close()
    print('Sample Email : ')
    print(email_contents)

    word_indices = processEmail.processEmail(email_contents)
    features = emailFeatures(word_indices)
    # NOTE(review): X below recomputes the same feature vector as `features`
    # on the previous line -- one of the two calls appears redundant.
    X = emailFeatures(word_indices);
    p = model.predict(X)
    print('\nEmail Processed\n\nSpam Classification: %d\n' % p);
    print('(1 indicates spam, 0 indicates not spam)\n\n');
def processEmail(email_contents):
    '''
    word_indices = PROCESSEMAIL(email_contents) preprocesses the body of an
    email and returns a list of indices of the words contained in the email.
    The vocabulary is a dict mapping stemmed word -> index.
    '''
    from getVocabList import getVocabList
    import re
    from nltk.stem.porter import PorterStemmer

    # Load Vocabulary
    vocabList = getVocabList()

    # ========================== Preprocess Email ===========================

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML: any expression that starts with < and ends with > and
    # does not have any < or > inside is replaced with a space.
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers: one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLs: strings starting with http:// or https://
    email_contents = re.sub(r'(http|https)://[^\s]+', 'httpaddr', email_contents)

    # Handle Email Addresses: strings with @ in the middle
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # Handle apostrophes
    email_contents = re.sub('[\']+', ' ', email_contents)

    # ========================== Tokenize Email ===========================

    # Tokenize on whitespace, strip non-alphanumerics, drop empty strings.
    token_str = re.split(r'[\s]', email_contents)
    token_str = [re.sub('[^a-zA-Z0-9]', '', l) for l in token_str]
    token_str = list(filter(None, token_str))

    # Output the email to screen as well
    print('\n==== Processed Email ====\n')
    print(token_str, '\n')

    # Stem each token using the Porter Stemming algorithm.
    porter_stemmer = PorterStemmer()
    word_stem = [porter_stemmer.stem(word) for word in token_str]

    # Look up each stemmed word in the vocabulary.
    # FIX: single dict lookup per word (the original called .get() twice).
    # NOTE(review): a falsy index (0) would be dropped here; this assumes
    # the vocabulary is 1-indexed -- confirm against getVocabList.
    word_indices = []
    for word in word_stem:
        index = vocabList.get(word)
        if index:
            word_indices.append(index)

    return word_indices
def processEmail(email_contents):
    """Preprocess the body of an email and return a list of word_indices.

    word_indices = PROCESSEMAIL(email_contents) preprocesses the body of
    an email and returns a list of indices (positions in vocabList) of the
    words contained in the email.
    """
    # Load Vocabulary (a list of words; position == word index)
    vocabList = getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with
    # the full headers
    # hdrstart = strfind(email_contents, ([char(10) char(10)]));
    # email_contents = email_contents(hdrstart(1):end);

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with >, does not
    # have any < or > in the tag, and replaces it with a space
    email_contents = re.compile('<[^<>]+>').sub(' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.compile('[0-9]+').sub(' number ', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.compile('(http|https)://[^\\s]*').sub(
        ' httpaddr ', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.compile('[^\\s]+@[^\\s]+').sub(' emailaddr ',
                                                       email_contents)

    # Handle $ sign
    email_contents = re.compile('[$]+').sub(' dollar ', email_contents)

    # Other: split on whitespace and punctuation, drop empty tokens
    email_contents = re.split('[ @$/#.-:&*+=\\[\\]?!(){},'
                              '\">_<;%\\n\\r]', email_contents)
    email_contents = [word for word in email_contents if len(word) > 0]

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n')

    # Precompute word -> first index once, instead of an O(n) list.index
    # scan per token.  setdefault keeps list.index's first-occurrence
    # semantics if the vocabulary ever contains duplicates.
    index_of = {}
    for i, vocab_word in enumerate(vocabList):
        index_of.setdefault(vocab_word, i)

    # Process file
    stemmer = PorterStemmer()
    processed_email = []
    for word in email_contents:
        # Remove any non alphanumeric characters, then stem
        word = re.compile('[^a-zA-Z0-9]').sub('', word).strip()
        word = stemmer.stem(word)
        processed_email.append(word)

        # Skip the word if it is too short
        if len(word) < 1:
            continue

        # Look up the word in the vocabulary and add its index if found
        index = index_of.get(word)
        if index is not None:
            word_indices.append(index)

    print(' '.join(processed_email))

    # Print footer
    print('\n\n=========================')

    return word_indices
def processEmail(email_contents):
    """Preprocess the body of an email and return a list of word indices.

    Returns the 0-based indices (into the vocabulary list returned by
    getVocabList) of the normalized, Porter-stemmed words in the email.
    """
    # Load Vocabulary (list of words; position == index)
    vocabList = getVocabList()

    # Map word -> first index for O(1) lookups; setdefault preserves the
    # original "first match wins" behavior of the linear scan with break.
    vocab_index = {}
    for i, w in enumerate(vocabList):
        vocab_index.setdefault(w, i)

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with >, has no
    # < or > inside the tag, and replaces it with a space
    email_contents = re.sub(r"<[^<>]+>", " ", email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub(r"[0-9]+", "number", email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub(r"(http|https)://[^\s]*", "httpaddr", email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub(r"[^\s]+@[^\s]+", "emailaddr", email_contents)

    # Handle $ sign
    email_contents = re.sub(r"[$]+", "dollar", email_contents)

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n\n')

    # Process file
    l = 0  # current printed-line length, for 78-column wrapping
    strs = re.split(r'[ `\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?\n\t]',
                    email_contents)
    p = PorterStemmer()
    non_alnum = re.compile(r"[^a-zA-Z0-9]")  # hoisted out of the loop
    for _str in strs:
        # Remove any non alphanumeric characters
        _str = non_alnum.sub("", _str)

        # Stem the word
        # (the porterStemmer sometimes has issues, so we use a try catch block)
        _str = p.stem(_str)

        # Skip the word if it is too short
        if len(_str) < 1:
            continue

        # Look up the word in the vocabulary and record its index
        idx = vocab_index.get(_str)
        if idx is not None:
            word_indices.append(idx)

        # Wrap the printed email at 78 columns.
        # BUG FIX: `l` was never incremented, so the wrap never triggered.
        if l + len(_str) + 1 > 78:
            print('')
            l = 0
        print(_str + " ", end='')
        l = l + len(_str) + 1

    # Print footer
    print('\n\n=========================\n')

    return word_indices
# BUG FIX: Python 2 print statements converted to Python 3 calls; the
# original `print 'fmt', value` printed a (format, value) tuple instead of
# interpolating — use % formatting.
print('Test Accuracy: %f' % (np.mean(np.double(p == ytest.flatten())) * 100))

## ================= Part 5: Top Predictors of Spam ====================
#  Since the model we are training is a linear SVM, we can inspect the
#  weights learned by the model to understand better how it is determining
#  whether an email is spam or not. The following code finds the words with
#  the highest weights in the classifier. Informally, the classifier
#  'thinks' that these words are the most likely indicators of spam.

# Sort the weights and obtain the vocabulary list
t = sorted(list(enumerate(model.coef_[0])), key=lambda e: e[1], reverse=True)
d = OrderedDict(t)
# BUG FIX: dict views are not subscriptable in Python 3 (idx[i] would raise
# TypeError) — materialize them as lists.
idx = list(d.keys())
weight = list(d.values())
vocabList = getVocabList()

print('Top predictors of spam: ')
for i in range(15):
    print(' %-15s (%f)' % (vocabList[idx[i]], weight[i]))

print('Program paused. Press enter to continue.')

## =================== Part 6: Try Your Own Emails =====================
#  Now that you've trained the spam classifier, you can use it on your own
#  emails! In the starter code, we have included spamSample1.txt,
#  spamSample2.txt, emailSample1.txt and emailSample2.txt as examples.
#  The following code reads in one of these emails and then uses your
#  learned SVM classifier to determine whether the email is Spam or
#  Not Spam
def processEmail(email_contents):
    """Preprocess the body of an email and return a list of word indices.

    vocabList here maps word -> index; an index of 0 means "not in the
    vocabulary" and such words are dropped (indices are presumably
    1-based — TODO confirm against getVocabList).
    """
    vocabList = getVocabList()
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with
    # the full headers
    # hdrstart = email_contents.find("\n\n")
    # if hdrstart:
    #     email_contents = email_contents[hdrstart:]

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with >, has no
    # < or > inside the tag, and replaces it with a space
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # Tokenize and get rid of any punctuation
    email_contents = re.split(r'[@$/#.-:&\*\+=\[\]?!(){},\'\'\">_<;%\s]+',
                              email_contents)
    # print(email_contents)

    # Output the email to screen as well
    #print('\n==== Processed Email ====\n\n')

    # Process file
    l = 0
    stemmer = PorterStemmer()  # hoisted: one instance instead of one per token
    for token in email_contents:
        # Remove any non alphanumeric characters
        token = re.sub('[^a-zA-Z0-9]', '', token)

        # Stem the word
        token = stemmer.stem(token.strip())

        # Skip the word if it is too short
        if len(token) < 1:
            continue

        # Single dict lookup instead of membership test + indexing;
        # 0 means "not found"
        idx = vocabList.get(token, 0)

        # only add entries which are in vocabList
        # i.e. those with idx != 0, given that idx is 0 if token
        # is not found in vocabList
        if idx > 0:
            word_indices.append(idx)

        # Print to screen, ensuring that the output lines are not too long.
        # BUG FIX: print(token) emitted a newline per token, which made the
        # 78-column wrap counter pointless; print with a trailing space.
        if l + len(token) + 1 > 78:
            print("")
            l = 0
        print(token, end=' ')
        l = l + len(token) + 1

    # Print footer
    #print('\n\n=========================\n')

    return word_indices