def read_file(path):
    """Read every .txt file under `path` and return a binary word-occurrence matrix X."""
    print("Reading files from", path)
    files = []
    # r = root, d = directories, f = files
    for r, d, f in os.walk(path):
        for file in f:
            if '.txt' in file:
                files.append(os.path.join(r, file))
    j = 0
    vocab_dict = vocab.getVocabDict()
    arrayDict = np.array(list(vocab_dict.items()))
    X = np.zeros((len(files), len(arrayDict)))
    for fname in files:
        email_contents = codecs.open(fname, "r", encoding="utf-8", errors="ignore").read()
        email = process_email.email2TokenList(email_contents)
        # Build the binary occurrence vector for this email.
        aux = np.zeros(len(arrayDict))
        for i in range(len(email)):
            index = np.where(arrayDict[:, 0] == email[i])
            aux[index] = 1
        X[j] = aux
        j = j + 1
    print("Files from", path, "read and stored in X.")
    return X
def load_email_data(folder_name, numExamples):
    """
    Loads the data from a folder containing email examples and transforms it
    into a vectorized form suitable for SVM training.

    Parameters:
        folder_name -- string with the name of the folder containing the examples
        numExamples -- the number of examples in the folder

    Returns:
        X -- a matrix with numExamples rows where X[i, j] == 1 iff example file
             (i + 1) contains word (j + 1) of the dictionary (files and vocabulary
             indices are 1-based, the matrix is 0-based)
    """
    X = np.zeros((numExamples, nwords))
    for i in range(numExamples):
        email_contents = codecs.open(folder_name + '/{:0>4d}.txt'.format(i + 1),
                                     'r', encoding='utf-8', errors='ignore').read()
        email = email2TokenList(email_contents)
        for word in email:
            index = dictionnary.get(word)
            if index is not None:
                X[i, index - 1] = 1
    return X
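# A minimal usage sketch (not part of the original code): it assumes the 'spam'
# and 'easy_ham' folders used elsewhere in this exercise, that `nwords` and
# `dictionnary` are already defined, and that scikit-learn is available.
import os
import numpy as np
from sklearn.svm import SVC

n_spam = len(os.listdir('spam'))
n_ham = len(os.listdir('easy_ham'))
X_spam = load_email_data('spam', n_spam)
X_ham = load_email_data('easy_ham', n_ham)

X_all = np.vstack((X_spam, X_ham))
y_all = np.concatenate((np.ones(n_spam), np.zeros(n_ham)))  # 1 = spam, 0 = ham

clf = SVC(C=1.0, kernel='linear')  # hyperparameters are illustrative only
clf.fit(X_all, y_all)
print('training accuracy:', clf.score(X_all, y_all))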
def read_easyHam():
    no_spam_emails = []
    directorio = "easy_ham"
    i = 1
    # os.listdir is only used to count the files; they are opened by their
    # 1-based, zero-padded numeric names.
    for _ in os.listdir(directorio):
        email_contents = codecs.open('{0}/{1:04d}.txt'.format(directorio, i),
                                     'r', encoding='utf-8', errors='ignore').read()
        tokens = email2TokenList(email_contents)
        tokens = convertToIndices(tokens)
        i += 1
        no_spam_emails.append(tokens)
    print("Easy ham emails read:", i - 1)
    return no_spam_emails
def process_email(name, dicti):
    print(name)
    # spam/0340.txt is skipped as a hard-coded special case; its feature
    # vector is left as all zeros.
    if name == "spam/0340.txt":
        return np.zeros(1899)
    email_contents = codecs.open(name, 'r', encoding='utf-8', errors='ignore').read()
    email = np.array(pm.email2TokenList(email_contents))
    # Map every token to its 1-based vocabulary index; unknown words map to -1
    # and are discarded, and the remaining indices are shifted to 0-based.
    index = np.vectorize(dicti.get)(email, -1)
    index = index[index != -1]
    index = index - 1
    vect = np.zeros(1899)
    vect[index] = 1
    return vect
def dataManager(ini, fin, directoryName, yValue):
    X = np.empty((0, 1899))
    Y = np.empty((0, 1))
    wordsDict = getVocabDict()  # load the vocabulary once, outside the loop
    # Reads files ini+1 .. fin (e.g. 60% of the 500 examples) and labels them
    # all with yValue.
    for i in range(ini + 1, fin + 1):
        email_contents = codecs.open('{0}/{1:04d}.txt'.format(directoryName, i),
                                     'r', encoding='utf-8', errors='ignore').read()
        email = email2TokenList(email_contents)
        wordOcurrence = emailToWordOcurrence(email, wordsDict)
        X = np.vstack((X, wordOcurrence))
        Y = np.vstack((Y, yValue))
    return X, Y
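# Hypothetical usage sketch (not in the original): a 60% / 20% / 20% split of
# 500 spam files, inferred from the "60% of 500" comment above; the folder name
# and label value are assumptions.
X_train, y_train = dataManager(0, 300, 'spam', 1)    # files 0001-0300
X_val, y_val = dataManager(300, 400, 'spam', 1)      # files 0301-0400
X_test, y_test = dataManager(400, 500, 'spam', 1)    # files 0401-0500
# The ham folders would be read the same way with yValue = 0 and stacked onto
# each split with np.vstack.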
def addX(directorio, X, Xval, Xtest, numFicheros, dicVoc):
    for i in range(numFicheros):
        email_contents = codecs.open('{0}/{1:04d}.txt'.format(directorio, i + 1),
                                     'r', encoding='utf-8', errors='ignore').read()
        tokens = email2TokenList(email_contents)
        # Binary word-occurrence vector, indexed by the 1-based vocabulary index.
        arrayPalabras = np.zeros(1900)
        for palabra in tokens:
            if palabra in dicVoc:
                arrayPalabras[dicVoc[palabra]] = 1
        # Split the examples 70% / 20% / 10% into training, validation and test sets.
        if i + 1 <= int(numFicheros * 0.7):
            X = np.vstack((X, arrayPalabras))  # append the row to the training set
        elif i + 1 <= int(numFicheros * 0.9):
            Xval = np.vstack((Xval, arrayPalabras))  # append the row to the validation set
        else:
            Xtest = np.vstack((Xtest, arrayPalabras))  # append the row to the test set
    return X, Xval, Xtest
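# Sketch of how addX might be called (not in the original); the folder names
# and the getVocabDict() helper used elsewhere in this exercise are assumed.
dicVoc = getVocabDict()
X = np.empty((0, 1900))
Xval = np.empty((0, 1900))
Xtest = np.empty((0, 1900))
X, Xval, Xtest = addX('spam', X, Xval, Xtest, len(os.listdir('spam')), dicVoc)
X, Xval, Xtest = addX('easy_ham', X, Xval, Xtest, len(os.listdir('easy_ham')), dicVoc)
# The label vectors must be built in the same 70/20/10 order so that each row
# of X, Xval and Xtest keeps its spam/ham label.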
plt.savefig('P1.3.png')
plt.show()

#%% Part 2

# Load the word dictionary
dic = getVocabDict()

# Read and vectorize the spam examples
spam = np.zeros((len(os.listdir('spam')), len(dic)))
for i, filename in enumerate(os.listdir('spam')):
    email_contents = codecs.open('spam/' + filename, 'r',
                                 encoding='utf-8', errors='ignore').read()
    email_tokens = email2TokenList(email_contents)
    for token in email_tokens:
        if token in dic:
            spam[i, dic[token] - 1] = 1

# Read and vectorize the easy-ham examples
easy = np.zeros((len(os.listdir('easy_ham')), len(dic)))
for i, filename in enumerate(os.listdir('easy_ham')):
    email_contents = codecs.open('easy_ham/' + filename, 'r',
                                 encoding='utf-8', errors='ignore').read()
    email_tokens = email2TokenList(email_contents)
    for token in email_tokens:
        if token in dic:
            easy[i, dic[token] - 1] = 1
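# A possible continuation sketched for reference (not in the original script):
# the shuffled 90/10 hold-out split and the SVC hyperparameters are assumptions.
from sklearn.svm import SVC

data = np.vstack((spam, easy))
labels = np.concatenate((np.ones(spam.shape[0]), np.zeros(easy.shape[0])))

# Shuffle and hold out 10% of the examples for evaluation.
rng = np.random.default_rng(0)
perm = rng.permutation(data.shape[0])
split = int(0.9 * data.shape[0])
train, test = perm[:split], perm[split:]

svm = SVC(C=1.0, kernel='rbf', gamma='scale')
svm.fit(data[train], labels[train])
print('test accuracy:', svm.score(data[test], labels[test]))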