def predict_email_spam(_filename):
    """Classify one email file from ``../data/`` and print the result.

    The file's text is passed through ``process_email`` and
    ``email_features``; the transposed feature array is then given to
    ``Classification.predict`` and the first element of the prediction is
    printed next to the file name.

    Args:
        _filename: Name of the email file, appended to ``'../data/'``.

    Returns:
        None. The result is only printed.
    """
    # Use a context manager so the file handle is closed even if read()
    # raises; the bare open(...).read() form left the handle open.
    with open('../data/' + _filename) as _email_file:
        _file_contents = _email_file.read()

    _word_indices = process_email(_file_contents)
    _features = email_features(_word_indices)
    _p = Classification.predict(_features.T)

    print('\nProcessed %s\n\nSpam Classification: %d\n' % (_filename, _p[0]))
    print('(1 indicates spam, 0 indicates not spam)\n\n')
print('\nProcessed %s\n\nSpam Classification: %d\n' % (_filename, _p[0])) print('(1 indicates spam, 0 indicates not spam)\n\n') if __name__ == '__main__': # ==================== Part 1: Email Preprocessing ==================== # To use an SVM to classify emails into Spam v.s. Non-Spam, you first need # to convert each email into a vector of features. In this part, you will # implement the preprocessing steps for each email. You should # complete the code in processEmail.m to produce a word indices vector # for a given email. print('\nPreprocessing sample email (emailSample1.txt)\n') # Extract Features file_contents = open('../data/emailSample1.txt').read() word_indices = process_email(file_contents) # Print Stats print('Word Indices:\n') print_index = 0 for print_value in word_indices: print_index += 1 print("%4d" % print_value, end=' ') if print_index % 10 == 0: print('\n') print('\n') print('Program paused. Press enter to continue.\n') # pause_func() # ==================== Part 2: Feature Extraction ==================== # Now, you will convert each email into a vector of features in R^n. # You should complete the code in emailFeatures.m to produce a feature
from processEmail import process_email
from vocabulary import map_words

# Pre-process email
# The sample text mixes HTML tags, upper-case words, punctuation, a dollar
# amount, newline/tab escapes and a URL.
words = process_email(
    "<html>HELLO world pythoner! It's $100, how are \n you're \t doing? http://ya.ru</html>"
)

# For each word in the email, find its index in the vocabulary
words_indices = map_words(words)

# Train classifier
# TODO: not implemented yet

# Predict
# TODO: not implemented yet

# Analyze
# TODO: not implemented yet
def classify_email(filename):
    """Run ``model`` on a single email file and print its prediction.

    Args:
        filename: Email file name, handed to ``read_file`` unchanged.

    Returns:
        None. The result is only printed.
    """
    # Raw text -> word indices -> feature array; 'vocab.txt' is forwarded
    # to process_email as its second argument.
    email_text = read_file(filename)
    indices = process_email(email_text, 'vocab.txt')
    feature_vector = email_features(indices)

    # reshape(1, -1): present the features as one row before predicting.
    single_sample = feature_vector.reshape(1, -1)
    prediction = model.predict(single_sample)

    report = '\nProcessed {}\n\nSpam Classification: {}\n'.format(filename, prediction)
    print(report)
    print('(1 indicates spam, 0 indicates not spam)\n\n')
def pause():
    """Block until the user presses Enter; the entered line is discarded."""
    input("")


# NOTE(review): the bare triple-quoted strings below are expression statements
# with no runtime effect — presumably section text carried over from a
# notebook; confirm before converting them to comments.
"""## Part 1: Email Pre-processing To use an SVM to classify emails into Spam v.s. Non-Spam, you first need to convert each email into a vector of features. In this part, you will implement the pre-processing steps for each email. You should complete the code in processEmail.py to produce a word indices vector for a given email."""

print('\nPre-processing sample email (emailSample1.txt)\n')

# Extract Features
file_contents = read_file('emailSample1.txt')
word_indices = process_email(file_contents, 'vocab.txt')

# Print Stats
print('Word Indices: \n')
print(word_indices)
print('\n\n')

# Wait for the user before moving on to the next part.
print('Program paused. Press enter to continue.\n')
pause()

""" ## Part 2: Feature Extraction Convert each email into a vector of features in R^n. """

print('\nExtracting features from sample email (emailSample1.txt)\n')