Пример #1
0
def read_file(path):
    """Build a binary bag-of-words matrix from the ``.txt`` files under *path*.

    The directory tree rooted at *path* is walked recursively. Every file
    whose name contains ``.txt`` is read, tokenised with
    ``process_email.email2TokenList`` and encoded as one row of the result.

    Args:
        path: Root directory to search for e-mail files.

    Returns:
        A float array of shape ``(number of files, vocabulary size)``. Entry
        ``(i, k)`` is 1 when the i-th file contains the k-th word of the
        vocabulary (in the iteration order of ``vocab.getVocabDict()``) and
        0 otherwise.
    """
    print("Leyendo archivos de ", path)

    files = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(path)
        for name in names
        if '.txt' in name
    ]

    vocab_dict = vocab.getVocabDict()
    # Column of a word = its position in the vocabulary's iteration order.
    # A dict lookup replaces a full scan of the vocabulary for every token.
    column_of = {word: column for column, word in enumerate(vocab_dict)}

    X = np.zeros((len(files), len(column_of)))

    for row, file_path in enumerate(files):
        # The context manager closes the handle even if reading fails.
        with codecs.open(file_path, "r", encoding="utf-8",
                         errors="ignore") as handle:
            email_contents = handle.read()

        for token in process_email.email2TokenList(email_contents):
            column = column_of.get(token)
            if column is not None:
                X[row, column] = 1

    print("Archivos de ", path, "leídos y guardados en X.")
    return X
Пример #2
0
def load_email_data(folder_name, numExamples):
    """Load numbered e-mail files and encode them as binary word vectors.

    Files are expected to be named ``0001.txt``, ``0002.txt``, ... inside
    *folder_name*. Each one is tokenised with ``email2TokenList`` and every
    token found in the module-level ``dictionnary`` sets one column to 1.

    Args:
        folder_name: Path of the folder containing the examples.
        numExamples: Number of examples to read from the folder.

    Returns:
        A ``(numExamples, nwords)`` float array. Row ``i`` describes file
        number ``i + 1``; column ``index - 1`` is 1 when the file contains
        the word that ``dictionnary`` maps to ``index``.
        NOTE(review): this assumes ``dictionnary`` values are 1-based word
        numbers -- confirm against where the dictionary is built.
    """
    X = np.zeros((numExamples, nwords))
    for i in range(numExamples):
        file_name = folder_name + '/{:0>4d}.txt'.format(i + 1)
        # The context manager closes the handle once the text is read.
        with codecs.open(file_name, 'r', encoding='utf-8',
                         errors='ignore') as email_file:
            email_contents = email_file.read()

        for word in email2TokenList(email_contents):
            index = dictionnary.get(word)
            if index is not None:
                X[i, index - 1] = 1

    return X
Пример #3
0
def read_easyHam():
    """Read every numbered e-mail of the ``easy_ham`` folder as index lists.

    The number of directory entries decides how many files are read; the
    files themselves are opened by number (``easy_ham/0001.txt``,
    ``easy_ham/0002.txt``, ...), not by the names ``os.listdir`` returns.

    Returns:
        A list with one element per e-mail: the result of
        ``convertToIndices(email2TokenList(contents))`` for that file.
    """
    directorio = "easy_ham"
    num_entries = len(os.listdir(directorio))

    no_spam_emails = []
    for number in range(1, num_entries + 1):
        file_name = '{0}/{1:04d}.txt'.format(directorio, number)
        # The context manager closes the handle once the text is read.
        with codecs.open(file_name, 'r', encoding='utf-8',
                         errors='ignore') as email_file:
            email_contents = email_file.read()
        tokens = convertToIndices(email2TokenList(email_contents))
        no_spam_emails.append(tokens)

    print("Easy Ham Readed: ", num_entries)
    return no_spam_emails
Пример #4
0
def process_email(name, dicti, vocab_size=1899):
    """Encode one e-mail file as a binary word-occurrence vector.

    Args:
        name: Path of the e-mail file; it is printed before processing.
        dicti: Mapping from token to word number. Column ``number - 1`` of
            the result is set for every token present in the mapping.
        vocab_size: Length of the returned vector (defaults to 1899, the
            size that was previously hard-coded).

    Returns:
        A float array of length *vocab_size* holding 1 for every vocabulary
        word found in the e-mail and 0 elsewhere.
    """
    print(name)
    # NOTE(review): this file is skipped on purpose by the original code and
    # always yields an all-zero vector; the reason is not visible here.
    if name == "spam/0340.txt":
        return np.zeros(vocab_size)

    # The context manager closes the handle once the text is read.
    with codecs.open(name, 'r', encoding='utf-8',
                     errors='ignore') as email_file:
        email_contents = email_file.read()

    tokens = pm.email2TokenList(email_contents)
    # Plain dict lookups instead of np.vectorize: np.vectorize raises on an
    # empty token list because it cannot infer the output type.
    lookups = (dicti.get(token, -1) for token in tokens)
    columns = np.array([number - 1 for number in lookups if number != -1],
                       dtype=int)

    vect = np.zeros(vocab_size)
    vect[columns] = 1
    return vect
Пример #5
0
def dataManager(ini, fin, directoryName, yValue):
    """Load e-mails ``ini + 1`` .. ``fin`` of a folder with a constant label.

    Files are opened as ``<directoryName>/NNNN.txt`` with a zero-padded
    four-digit number. Each one is tokenised with ``email2TokenList`` and
    converted by ``emailToWordOcurrence`` into one row of ``X``.

    Args:
        ini: Number of the last file to skip (reading starts at ``ini + 1``).
        fin: Number of the last file to read (inclusive).
        directoryName: Folder containing the numbered files.
        yValue: Label stored in ``Y`` for every file read.

    Returns:
        A tuple ``(X, Y)``: ``X`` stacks one occurrence row per file on top
        of an empty ``(0, 1899)`` array, and ``Y`` is a column holding
        *yValue* once per file.
    """
    X = np.empty((0, 1899))
    Y = np.empty((0, 1))
    if fin <= ini:
        # Nothing to read: return the empty arrays without touching disk.
        return X, Y

    # The vocabulary is loaded once instead of once per file.
    # NOTE(review): assumes emailToWordOcurrence does not modify it -- confirm.
    wordsDict = getVocabDict()

    rows = []
    for i in range(ini + 1, fin + 1):
        file_name = '{0}/{1:04d}.txt'.format(directoryName, i)
        # The context manager closes the handle once the text is read.
        with codecs.open(file_name, 'r', encoding='utf-8',
                         errors='ignore') as email_file:
            email_contents = email_file.read()
        email = email2TokenList(email_contents)
        rows.append(emailToWordOcurrence(email, wordsDict))

    # Stack once at the end; stacking inside the loop copies the whole
    # matrix on every iteration.
    X = np.vstack([X] + rows)
    Y = np.vstack([Y] + [yValue] * len(rows))
    return X, Y
Пример #6
0
def addX(directorio, X, Xval, Xtest, numFicheros, dicVoc):
    """Append the e-mails of a folder to the train/validation/test matrices.

    Files ``0001.txt`` .. ``<numFicheros>.txt`` of *directorio* are read in
    order. The first 70% (by file number) go to *X*, the files up to the 90%
    mark go to *Xval* and the remainder goes to *Xtest*.

    Args:
        directorio: Folder containing the numbered e-mail files.
        X: Training matrix to extend.
        Xval: Validation matrix to extend.
        Xtest: Test matrix to extend.
        numFicheros: Number of files to read from the folder.
        dicVoc: Mapping from token to the column set to 1 for that token.

    Returns:
        A tuple ``(X, Xval, Xtest)`` with the new rows stacked underneath
        the given matrices. A matrix that receives no rows is returned
        unchanged.
    """
    train_limit = int(numFicheros * 0.7)
    val_limit = int(numFicheros * 0.9)
    train_rows, val_rows, test_rows = [], [], []

    for i in range(numFicheros):
        numero = i + 1
        file_name = '{0}/{1:04d}.txt'.format(directorio, numero)
        # The context manager closes the handle once the text is read.
        with codecs.open(file_name, 'r', encoding='utf-8',
                         errors='ignore') as email_file:
            email_contents = email_file.read()

        # The dictionary value is used directly as the column index.
        # NOTE(review): length 1900 presumably leaves column 0 unused for a
        # 1-based vocabulary of 1899 words -- confirm.
        arrayPalabras = np.zeros(1900)
        for palabra in email2TokenList(email_contents):
            columna = dicVoc.get(palabra)
            if columna is not None:
                arrayPalabras[columna] = 1

        if numero <= train_limit:
            train_rows.append(arrayPalabras)
        elif numero <= val_limit:
            val_rows.append(arrayPalabras)
        else:
            test_rows.append(arrayPalabras)

    # Stack each split once; stacking inside the loop copies the whole
    # matrix for every file.
    if train_rows:
        X = np.vstack([X] + train_rows)
    if val_rows:
        Xval = np.vstack([Xval] + val_rows)
    if test_rows:
        Xtest = np.vstack([Xtest] + test_rows)
    return X, Xval, Xtest
Пример #7
0
# Save the figure to 'P1.3.png' first, then display it.
# NOTE(review): plt is presumably matplotlib.pyplot -- confirm against the
# imports, which are outside this excerpt.
plt.savefig('P1.3.png')
plt.show()

#%% Part 2


def _load_email_folder(folder, vocab):
    """Return a binary word-occurrence matrix for every file in *folder*.

    Row ``i`` belongs to the i-th entry of ``os.listdir(folder)``. Column
    ``vocab[token] - 1`` is set to 1 for each token of the e-mail that is
    present in *vocab*.
    """
    # List the folder once so the matrix size and the rows always agree.
    filenames = os.listdir(folder)
    features = np.zeros((len(filenames), len(vocab)))
    for row, filename in enumerate(filenames):
        # The context manager closes the handle once the text is read.
        with codecs.open(folder + '/' + filename,
                         'r',
                         encoding='utf-8',
                         errors='ignore') as email_file:
            email_contents = email_file.read()
        for token in email2TokenList(email_contents):
            number = vocab.get(token)
            if number is not None:
                features[row, number - 1] = 1
    return features


# Load the word dictionary
dic = getVocabDict()

# Read and process the spam e-mails
spam = _load_email_folder('spam', dic)

# Read and process the easy ham e-mails
easy = _load_email_folder('easy_ham', dic)