def output(partId):
    """Build the formatted answer string for one part of the exercise.

    Fixed, deterministic inputs are generated up front and the branch
    matching ``partId`` feeds them to the function under test, rendering
    the result with ``formatter``.

    Args:
        partId: Part selector, compared against the strings '1'..'4'.

    Returns:
        Whatever ``formatter`` produces for the selected part (for part '2'
        the formatted ``C`` followed by the formatted ``sigma``).

    Raises:
        UnboundLocalError: if ``partId`` matches none of the branches,
            because ``out`` is then never assigned.
    """
    # Deterministic stand-ins for "random" test data.
    sample_points = np.arange(1, 11)
    x1 = np.sin(sample_points)
    x2 = np.cos(sample_points)
    ec = 'the quick brown fox jumped over the lazy dog'
    half = np.abs(np.round(x1 * 1863)).astype(int)
    wi = np.concatenate([half, half])  # the same ten indices, repeated

    if partId == '1':
        out = formatter('%0.5f ', gaussianKernel(x1, x2, 2))
    elif partId == '2':
        data = scipy.io.loadmat('ex6data3.mat')
        C, sigma = dataset3Params(
            data['X'],
            data['y'].ravel(),
            data['Xval'],
            data['yval'].ravel(),
        )
        out = formatter('%0.5f ', C) + formatter('%0.5f ', sigma)
    elif partId == '3':
        # Indices are shifted by one before formatting.
        out = formatter('%d ', processEmail(ec) + 1)
    elif partId == '4':
        out = formatter('%d ', emailFeatures(wi))
    return out
Exemplo n.º 2
0
from emailFeatures import emailFeatures
from getVocabList import getVocabList

## ==================== Part 1: Email Preprocessing ====================
#  To use an SVM to classify emails into Spam v.s. Non-Spam, you first need
#  to convert each email into a vector of features. In this part, you will
#  implement the preprocessing steps for each email. You should
#  complete the code in processEmail.m to produce a word indices vector
#  for a given email.

print 'Preprocessing sample email (emailSample1.txt)'

# Extract Features
file = open('emailSample1.txt', 'r')
file_contents = file.readlines()
word_indices = processEmail(''.join(file_contents))

# Print Stats
print 'Word Indices: '
print word_indices

raw_input("Program paused. Press Enter to continue...")

## ==================== Part 2: Feature Extraction ====================
#  Now, you will convert each email into a vector of features in R^n.
#  You should complete the code in emailFeatures.m to produce a feature
#  vector for a given email.

print 'Extracting features from sample email (emailSample1.txt)'

# Extract Features
Exemplo n.º 3
0
from linearKernel import linearKernel
from svmPredict import svmPredict

## ==================== Part 1: Email Preprocessing ====================
#  To use an SVM to classify emails into Spam v.s. Non-Spam, each email is
#  first converted into a vector of features. This part runs the
#  preprocessing step: processEmail() turns the raw text of an email into
#  a word indices vector.
#  NOTE(review): processEmail is not imported in the visible part of this
#  script -- confirm it is imported above.

print 'Preprocessing sample email (emailSample1.txt)'

# Extract Features
# Read the whole email as a single string; the file is closed on exit
# from the `with` block.
with open('emailSample1.txt') as f:
    file_contents = f.read()
word_indices = processEmail(file_contents)

# Print Stats
print 'Word Indices:'
print word_indices
print

# Block until the user presses Enter before moving on to the next part.
print 'Program paused. Press enter to continue.'
raw_input()

## ==================== Part 2: Feature Extraction ====================
#  Now, you will convert each email into a vector of features in R^n.
#  You should complete the code in emailFeatures.py to produce a feature
#  vector for a given email.

print '\nExtracting features from sample email (emailSample1.txt)'
Exemplo n.º 4
0
import numpy as np
import scipy.io as sio   # Used to load the OCTAVE *.mat files
from sklearn import svm
from processEmail import processEmail, getVocabList

## ==================== Part 1: Email Preprocessing ====================
#  To use an SVM to classify emails into Spam v.s. Non-Spam, each email is
#  first converted into a vector of features. Here processEmail() is called
#  on the raw email text and its result is used directly as the feature
#  vector.
print("Preprocessing and extracting features sample email (emailSample1.txt)")

# Extract Features
# The file is read in one call and closed when the `with` block exits.
with open('emailSample1.txt', 'r') as f:
    features =processEmail(f.read())

# `features` must support len(), .sum() and .shape -- presumably a numpy
# array returned by this project's processEmail (TODO confirm).
print('length of vector = {}\nnum of non-zero = {}'
        .format(len(features), int(features.sum())))
print(features.shape)

## =========== Part 2: Train Linear SVM for Spam Classification ========
#  In this section, you will train a linear classifier to determine if an
#  email is Spam or Not-Spam.

# Load the Spam Email dataset
# X and y are taken from the 'X' and 'y' entries of the .mat file.
mat = sio.loadmat('spamTrain.mat')
X, y = mat['X'], mat['y']

print("\nTraining Linear SVM (Spam Classification)")
from emailFeatures import emailFeatures

#2.1
# Read the sample email and the raw vocabulary text. Context managers close
# both handles; previously they were opened inline and never closed.
with open("emailSample1.txt", "r") as email_file:
    email_contents = email_file.read()
with open("vocab.txt", "r") as vocab_file:
    vocabList = vocab_file.read()

#2.1.1
# One vocabulary entry per non-empty line. Dropping only empty lines (rather
# than blindly discarding the last element) keeps the final word even when
# the file does not end with a newline.
vocabList = [line for line in vocabList.splitlines() if line]

# Each line holds two tab-separated fields; the second field becomes the
# dictionary key and the first its value (presumably word -> index, with
# the index kept as a string -- verify against vocab.txt).
vocabList_d = {}
for ea in vocabList:
    value, key = ea.split("\t")
    vocabList_d[key] = value

word_indices = processEmail(email_contents, vocabList_d)

#2.2
featureVector = emailFeatures(word_indices)
#print(np.sum(featureVector))

#2.3
# NOTE(review): `loadmat` is not imported in the visible part of this
# script -- confirm it is imported above.
spamTrainData = loadmat('spamTrain.mat')
#print(spamTrainData)

X = spamTrainData['X']
y = spamTrainData['y']

print('>   ' ,spamTrainData)
Exemplo n.º 6
0
import numpy as np


def emailFeatures(word_indices, n=1899):
    """Return a binary feature row vector marking which indices occur.

    Args:
        word_indices: Iterable of integer positions to mark. Duplicates are
            allowed and have no additional effect.
        n: Length of the feature vector. Defaults to 1899, the value that
            was previously hard-coded.

    Returns:
        A float numpy array of shape ``(1, n)`` holding 1 at every position
        listed in ``word_indices`` and 0 everywhere else.

    Raises:
        IndexError: if an index falls outside the vector or is not an
            integer.
    """
    x = np.zeros((n, 1))

    # Materialise first so generators and other one-shot iterables work.
    indices = np.asarray(list(word_indices))
    # An empty list converts to a float array, which numpy rejects as an
    # index, so only assign when there is something to mark.
    if indices.size:
        x[indices] = 1

    return x.T


if __name__ == '__main__':
    f = open('emailSample1.txt')
    email_contents = f.read()
    from processEmail import processEmail
    word_indices = processEmail(email_contents)
    print word_indices
    print len(word_indices)
    features = emailFeatures(word_indices)
    print len(features)

    print np.sum(features)
Exemplo n.º 7
0
def extractFeature(mails):
    """Stack the processEmail result of every mail into one numpy array.

    Each element of ``mails`` is passed to ``processEmail`` together with
    the module-level ``vocabList``; the results are collected in input
    order and handed to ``np.array``.
    """
    rows = []
    for mail in mails:
        rows.append(processEmail(mail, vocabList))
    return np.array(rows)
Exemplo n.º 8
0
#

# ==================== Part 1: Email Preprocessing ====================
#  To use an SVM to classify emails into Spam v.s. Non-Spam, each email is
#  first converted into a vector of features. This part runs the
#  preprocessing step: processEmail() turns the contents of an email into
#  a word indices vector.

# NOTE(review): absolute, machine-specific directory; the script only runs
# where this exact path exists.
ml_dir = '/Users/gregory/Desktop/me/coursera/machine_learning/ml_python/machine-learning-ex6/ex6/'
fname = ml_dir + 'emailSample1.txt'
# readlines() yields a list of lines, not a single string.
with open(fname) as f:
    file_contents = f.readlines()

# Extract Features
# NOTE(review): processEmail is given the list of lines here -- confirm
# that this project's processEmail accepts a list rather than one string.
word_indices  = processEmail(file_contents)

# Print Stats
print('Word Indices: \n')
print(word_indices)


# ==================== Part 2: Feature Extraction ====================
#  Now each email is converted into a vector of features in R^n.
#  emailFeatures() produces the feature vector for a given email.

# Extract Features
#file_contents = readFile('emailSample1.txt');
#word_indices  = processEmail(file_contents);
# Number of entries returned by getVocabList().
vocab_length = len(getVocabList())
Exemplo n.º 9
0
def main():
    """Run the spam-classification exercise end to end.

    Reads the sample emails and the ``spamTrain.mat`` / ``spamTest.mat``
    data sets from the ``dataSets`` directory under the current working
    directory, trains a linear SVM, prints training and test accuracy,
    lists the 15 vocabulary words with the largest weights and finally
    classifies ``spamSample1.txt``. All results are printed; nothing is
    returned.
    """
    path = os.path.join(os.getcwd(), 'dataSets')

    # ===============  Part 1 ====================
    #  To use an SVM to classify emails into Spam v.s. Non-Spam, we first need
    #  to convert each email into a vector of features. In this part, we
    #  implement the preprocessing steps for each email.

    # `with` guarantees the handle is closed even if reading fails.
    with open(os.path.join(path, "emailSample1.txt"), 'r') as f:
        email_contents = f.read()

    print(email_contents)

    word_indices = processEmail.processEmail(email_contents)
    features = emailFeatures(word_indices)

    print('Word Indices :\n')
    print(word_indices, "\n")

    # =============  Part 2  =======================
    # Print Stats
    # numpy.size / numpy.sum give the element count and total regardless of
    # whether the vector is 1-D, a row or a column; len() and the builtin
    # sum() only look at the first axis.
    print('Length of feature vector: %d\n' % numpy.size(features))
    print('Number of non-zero entries: %d\n' % numpy.sum(features))

    # =============  Part 3  ======================
    #  In this section, we will train a linear classifier to determine if an
    #  email is Spam or Not-Spam.

    print('\n\nRunning SVM on training set...')

    mat = io.loadmat(os.path.join(path, 'spamTrain.mat'))

    X = mat['X']
    # Labels are flattened to 1-D before fitting.
    y = numpy.ravel(mat['y'])

    model = svm.SVC(C=0.1, kernel='linear')
    model.fit(X, y)

    # score() predicts internally, so no separate predict() call is needed.
    accuracy = model.score(X, y) * 100.0

    print('\nTraining Accuracy: %.2f' % accuracy)

    # ================ Part 4 ========================
    # Evaluate on the held-out set stored under 'Xtest' / 'ytest'.
    mat = io.loadmat(os.path.join(path, 'spamTest.mat'))

    XTest = mat['Xtest']
    yTest = numpy.ravel(mat['ytest'])

    accuracy = model.score(XTest, yTest) * 100.0

    print('\nTest Accuracy: %.2f' % accuracy)

    # ================ Part 5 ============================
    #  Since the model we are training is a linear SVM, we can inspect the
    #  weights learned by the model to understand better how it is determining
    #  whether an email is spam or not. The following code finds the words with
    #  the highest weights in the classifier. Informally, the classifier
    #  'thinks' that these words are the most likely indicators of spam.

    print('\nTop spam predictors (keywords) \n')

    z = numpy.ravel(model.coef_)

    vocabList = getVocabList.getVocabList()

    # Map each vocabulary word to its weight. vocabList is indexed by
    # position (not iterated) so the same lookup as before is used whatever
    # container getVocabList returns.
    dic = {}
    for i, weight in enumerate(z):
        dic[vocabList[i]] = weight

    # The 15 words with the largest weights, in descending order.
    for w in sorted(dic, key=dic.get, reverse=True)[:15]:
        print('{0:10} - {1:10f}'.format(w, dic[w]))

    print('\n\n')

    # ============ Part 6: Test a sample Email =====================
    #  Now that we have trained the spam classifier, we can use it on our own
    #  emails!
    #  The following code reads in one of these emails and then uses our
    #  learned SVM classifier to determine whether the email is Spam or
    #  Not Spam

    with open(os.path.join(path, "spamSample1.txt"), 'r') as f:
        email_contents = f.read()

    print('Sample Email : ')
    print(email_contents)

    word_indices = processEmail.processEmail(email_contents)

    features = emailFeatures(word_indices)
    # predict() expects a 2-D (n_samples, n_features) array, so the feature
    # vector is presented as exactly one sample row.
    X = numpy.reshape(features, (1, -1))

    p = model.predict(X)

    # predict() returns one label per sample; format the single label rather
    # than the whole array.
    print('\nEmail Processed\n\nSpam Classification: %d\n' % p[0])
    print('(1 indicates spam, 0 indicates not spam)\n\n')
from emailFeatures import emailFeatures
from getVocabList import getVocabList

## ==================== Part 1: Email Preprocessing ====================
#  To use an SVM to classify emails into Spam v.s. Non-Spam, you first need
#  to convert each email into a vector of features. In this part, you will
#  implement the preprocessing steps for each email. You should
#  complete the code in processEmail.m to produce a word indices vector
#  for a given email.

print 'Preprocessing sample email (emailSample1.txt)'

# Extract Features
file = open('ex6/emailSample1.txt', 'r')
file_contents = file.readlines()
word_indices  = processEmail(''.join(file_contents))

# Print Stats
print 'Word Indices: '
print word_indices

#raw_input("Program paused. Press Enter to continue...")

## ==================== Part 2: Feature Extraction ====================
#  Now, you will convert each email into a vector of features in R^n.
#  You should complete the code in emailFeatures.m to produce a feature
#  vector for a given email.

print 'Extracting features from sample email (emailSample1.txt)'

# Extract Features
Exemplo n.º 11
0

# Build a lookup dictionary from the vocabulary lines.
# NOTE(review): `vocabList` and `file_contents` are defined outside the
# visible part of this script.
# Each entry is split on a tab into two fields; the second field becomes the
# key and the first its value (presumably word -> index -- TODO confirm).
vocabList_d={} 
for element_of_list in vocabList:
    value, key = element_of_list.split("\t")[:]
    vocabList_d[key] = value



# Echo the raw email text before processing it.
print(file_contents)




# Convert the email text into word indices using the lookup dictionary.
word_indices= processEmail.processEmail(file_contents,vocabList_d)



# Turn the word indices into a feature vector and report its length and
# the sum of its entries.
features = emailFeatures.emailFeatures(word_indices,vocabList_d)
print("Length of feature vector: ", len(features))
print("Number of non-zero entries: ", np.sum(features))





# Load the training set; the 'X' and 'y' entries of the .mat file become
# the training inputs and labels.
spam_mat = loadmat("Data/spamTrain.mat")
X_train =spam_mat["X"]
y_train = spam_mat["y"]
Exemplo n.º 12
0

from processEmail import processEmail

# Read the whole sample email in a single call. The context manager closes
# the handle afterwards; previously the file was opened and never closed.
with open('data/spamSample2.txt', mode='r') as file:
    all_of_it = file.read()
processEmail(all_of_it)

Exemplo n.º 13
0
def ex6_spam():
    """Driver for Exercise 6: spam classification with a linear SVM.

    Walks through the six parts of the exercise in order -- email
    preprocessing, feature extraction, training, evaluation on the test
    set, listing the top spam predictors and classifying one sample email --
    printing the result of each step. Nothing is returned.

    The heavy lifting is done by helpers defined elsewhere: readFile,
    processEmail, emailFeatures, formatter, svmTrain, svmPredict,
    linearKernel and getVocabList.
    """
    sample_email = 'emailSample1.txt'

    # ---------------- Part 1: Email Preprocessing ----------------
    # Turn the raw text of the sample email into a list of word indices.
    print('\nPreprocessing sample email (emailSample1.txt)')

    file_contents = readFile(sample_email)
    word_indices = processEmail(file_contents)

    # Indices are shifted by one for display.
    shifted_indices = np.array(word_indices) + 1
    print('Word Indices: ')
    print(formatter(' %d', shifted_indices))
    print('\n')

    print('Program paused. Press enter to continue.')

    # ---------------- Part 2: Feature Extraction ----------------
    # Convert the word indices into a feature vector.
    print('\nExtracting features from sample email (emailSample1.txt)')

    file_contents = readFile(sample_email)
    word_indices = processEmail(file_contents)
    features = emailFeatures(word_indices)

    positive_count = np.sum(features > 0)
    print('Length of feature vector: %d' % features.size)
    print('Number of non-zero entries: %d' % positive_count)

    print('Program paused. Press enter to continue.')

    # ------- Part 3: Train Linear SVM for Spam Classification -------
    # Training inputs are cast to float; labels are the first column of 'y'.
    train_data = scipy.io.loadmat('spamTrain.mat')
    X = train_data['X'].astype(float)
    y = train_data['y'][:, 0]

    print('\nTraining Linear SVM (Spam Classification)\n')
    print('(this may take 1 to 2 minutes) ...\n')

    C = 0.1
    model = svmTrain(X, y, C, linearKernel)

    train_pred = svmPredict(model, X)
    train_accuracy = np.mean(train_pred == y) * 100
    print('Training Accuracy: %f' % train_accuracy)

    # ---------------- Part 4: Test Spam Classification ----------------
    # Evaluate the trained model on the held-out set in spamTest.mat.
    test_data = scipy.io.loadmat('spamTest.mat')
    Xtest = test_data['Xtest'].astype(float)
    ytest = test_data['ytest'][:, 0]

    print('\nEvaluating the trained Linear SVM on a test set ...\n')

    test_pred = svmPredict(model, Xtest)
    test_accuracy = np.mean(test_pred == ytest) * 100
    print('Test Accuracy: %f\n' % test_accuracy)

    # ---------------- Part 5: Top Predictors of Spam ----------------
    # For a linear SVM the learned weights can be inspected directly: the
    # words with the largest weights are the ones the classifier treats as
    # the strongest indicators of spam.
    weights = model['w']
    ascending = np.argsort(weights)
    # Last 15 positions of the ascending order, reversed -> largest first.
    top_idx = ascending[-15:][::-1]
    vocabList = getVocabList()

    top_words = np.array(vocabList)[top_idx]
    top_weights = weights[top_idx]

    print('\nTop predictors of spam: ')
    for word, weight in zip(top_words, top_weights):
        print(' %-15s (%f)' % (word, weight))

    print('\n')
    print('\nProgram paused. Press enter to continue.')

    # ---------------- Part 6: Try Your Own Emails ----------------
    # Classify a single email with the trained model. Change `filename` to
    # spamSample2.txt, emailSample1.txt or emailSample2.txt (or your own
    # file) to see other predictions.
    filename = 'spamSample1.txt'

    file_contents = readFile(filename)
    word_indices = processEmail(file_contents)
    x = emailFeatures(word_indices)
    # The feature vector is flattened before being handed to svmPredict.
    p = svmPredict(model, x.ravel())

    print('\nProcessed %s\n\nSpam Classification: %d' % (filename, p))
    print('(1 indicates spam, 0 indicates not spam)\n')