def get_word_data_from_email(email_addr, n_counter=10):
    """Return the concatenated parsed text of up to n_counter emails sent
    from email_addr, or the string "NaN" when no listing file exists.

    Reads DIR/from_<email_addr>.txt (one email path per line), parses each
    listed email with parseOutText, and joins the results with spaces.
    Fix: the listing file and each email file are now always closed.
    """
    DIR = "../dataset/emails_by_address"
    try:
        f_emails = open("{dir}/from_{addr}.txt".format(addr=email_addr, dir=DIR), "r")
    except IOError:
        # no index file for this address
        return "NaN"
    word_data = []
    try:
        for counter, path in enumerate(f_emails, start=1):
            if counter > n_counter:
                break
            # strip the trailing newline and anchor the path under DIR
            path = "{dir}/".format(dir=DIR) + path[:-1]
            email = open(path, "r")
            try:
                ### use parseOutText to extract the text from the opened email
                words = parseOutText(email).replace("\n", "").replace("\r", "")
            finally:
                email.close()
            word_data.append(words)
    finally:
        f_emails.close()
    return " ".join(word_data)
def addFeatures(data_dict, features_list):
    """Add a 'mentions' feature to every person in data_dict.

    For each person, opens the from_<email_address>.txt listing (when one
    exists under the module-level emails_by_address_path), parses every
    listed email with parseOutText, and counts POI-identifier mentions via
    num_mentions.  Mutates data_dict in place, appends 'mentions' to
    features_list, and returns both.  Python 2 (iteritems / print
    statement).
    """
    emaillistings = os.listdir(emails_by_address_path)
    for person, features in data_dict.iteritems():
        mentions = 0
        eaddr = features['email_address']
        listing = "from_" + eaddr + ".txt"
        if listing in emaillistings:
            emailfiles = open(emails_by_address_path + '/' + listing, 'r')
            for emailfilepath in emailfiles:
                #emailfilepath = emailfilepath.replace('enron_mail_20110402/','../')
                email = open(emailfilepath.strip())
                words = parseOutText(email)
                # contains email address or names
                mentions += num_mentions(words, poiIdentifiers(), person)
            print person + " : " + str(mentions)
        # people with no listing keep mentions == 0
        features['mentions'] = mentions
    features_list.append('mentions')
    return (data_dict, features_list)
def getSingleWordData(email):
    """Return [words] parsed from the emails sent by this address, or []
    when no ./emails_by_address/from_<email>.txt listing exists.

    NOTE(review): `words` is overwritten on every loop iteration, so only
    the LAST listed email's text is returned -- confirm this is intended
    (compare updateWordData, which appends per email).
    """
    fn = "from_" + email + ".txt"
    words = ""
    try:
        from_file = open("./emails_by_address/" + fn, 'r')
        for path in from_file:
            # rebase the listed path at its 'maildir' component, one
            # directory up, dropping the trailing newline
            path = path[path.index('maildir'):]
            path = os.path.join('..', path[:-1])
            email = open(path, 'r')   # rebinds the parameter to a file handle
            words = parseOutText(email)
            email.close()
        from_file.close()
    except IOError:
        return []
    return [words]
def updateWordData(email,label,word_data,word_data_label):
    """Append the parsed text of every email sent by `email` to word_data,
    with a matching `label` entry appended to word_data_label.

    Silently does nothing when the from_<email>.txt listing is missing.
    """
    listing_path = "./emails_by_address/" + ("from_" + email + ".txt")
    try:
        from_file = open(listing_path, 'r')
        for line in from_file:
            # rebase each listed path at its 'maildir' component, one
            # directory up, dropping the trailing newline
            rel = line[line.index('maildir'):]
            rel = os.path.join('..', rel[:-1])
            message = open(rel, 'r')
            word_data.append(parseOutText(message))
            word_data_label.append(label)
            message.close()
        from_file.close()
    except IOError:
        # missing listing file: leave the accumulators untouched
        pass
def get_emails(email_addr, sig_words, n_emails, to=True): '''get email word data for a person using email address. To get emails *to* this person, set to=True, To get emails *from* this person, set to=False. ''' emails = [] addr_dir = 'emails_by_address' pre = 'to_' if to else 'from_' filename = pre + email_addr + '.txt' path = addr_dir + '/' + filename with open(path, 'r') as f: temp_email_locs = f.readlines() # get lines that have valid email locations email_locs = [] for loc in temp_email_locs: if 'maildir' in loc: maildir_idx = loc.index('maildir') # You may need to modify this line if you have emails in a # different location or with a different naming format new_loc = 'C:/enron/' + loc[maildir_idx:-1].replace('.','_') email_locs.append(new_loc) # if n_emails is specified as 'all', check all the emails # if there aren't n_emails to check, check as many as we can if n_emails == 'all' or len(email_locs) < n_emails: n_emails = len(email_locs) # shuffle the email list so we can randomly choose n_emails shuffle(email_locs) email_locs = email_locs[:n_emails] # get email contents for loc in email_locs: try: email = open(loc, 'r') except: # just going to skip to the next email if one won't open print "\nTried and failed to open %s" % loc continue text = parseOutText(email) # close the file after we got the text out of it email.close() # remove instances of signature words for sig_word in sig_words: text = text.replace(sig_word, '') emails.append(text) # return the parsed emails as one big string return ' '.join(emails)
def vectorize_emails(from_sara, from_chris, words_to_remove = [], max_emails = False): ''' This function can be used independently. It takes as input the file pointers from_sara and from_chris, mandatory, and optionally: a list of words to remove (if not sent, it won't remove any words from the emails) and a max_emails (if parameter not sent, it uses all emails available, if max_emails is larger than the number of emails available, it uses all available emails) ''' from_data = [] word_data = [] ### temp_counter is a way to speed up the development--there are ### thousands of emails from Sara and Chris, so running over all of them ### can take a long time ### temp_counter helps you only look at the first 200 emails in the list so you ### can iterate your modifications quicker for name, from_person in [("sara", from_sara), ("chris", from_chris)]: temp_counter = 0 for path in from_person: if max_emails and temp_counter == max_emails: break temp_counter += 1 path = os.path.join('..', path[:-1]) # print path email = open(path, "r") ### use parseOutText to extract the text from the opened email text = parseOutText(email) ### use str.replace() to remove any instances of the words for word in words_to_remove: text = text.replace(word, '') ### append the text to word_data word_data.append(text) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris from_data.append(str(int(name == 'sara'))) email.close() print "emails processed" from_sara.close() from_chris.close() return word_data, from_data
def extractTextData(email_list): from_data = [] word_data = [] emails_nf = [] person_emails = {} temp_counter = 0 for email_address, poi in email_list: try: from_person = open(os.path.join(emails_by_address_path, "from_"+email_address+".txt"), "r") word_data_from_person = [] for path in from_person: temp_counter += 1 if (temp_counter % 5000) == 0: print "{} parsed emails".format(temp_counter) path = os.path.join(mail_dir_path , path[:-1]) ### using parseOutText to extract the text from the opened email email = open(path, "r") text = parseOutText(email) ### append the text to word_data word_data.append(text) word_data_from_person.append(text) ### append a 0 to from_data if email is not from POI, and 1 if email is from a POI from_data.append(poi) email.close() person_emails[email_address] = word_data_from_person except: emails_nf.append(email_address) from_person.close() print "all emails processed: %s total" %temp_counter print "%s emails not found" %len(emails_nf) pickle.dump( word_data, open("enron_word_data.pkl", "w") ) pickle.dump( from_data, open("enron_email_authors.pkl", "w") ) pickle.dump( person_emails, open("emails_from_person.pkl", "w") )
def get_word_data():
    """Parse the Sara/Chris emails into the module-level word_data /
    from_data lists, pickle them, and return word_data.

    Relies on module-level names: from_sara, from_chris (open listing
    files), word_data, from_data (lists), temp_counter (int), and
    parseOutText.  NOTE(review): temp_counter is never incremented here
    (the += line is commented out), so `if temp_counter < 200` is a
    constant gate -- everything is processed when the global starts below
    200.  Python 2 (print statement).
    """
    for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
        for path in from_person:
            ### only look at first 200 emails when developing
            ### once everything is working, remove this line to run over full dataset
            #temp_counter += 1
            if temp_counter < 200:
                path = os.path.join('..', path[:-1])
                print path
                email = open(path, "r")
                ### use parseOutText to extract the text from the opened email
                email_parsed = parseOutText(email)
                ### use str.replace() to remove any instances of the words
                ### ["sara", "shackleton", "chris", "germani"]
                remove = ["sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf"]
                for word in remove:
                    email_parsed = email_parsed.replace(word,"")
                ### append the text to word_data
                word_data.append(email_parsed)
                ### 0 = from Sara, 1 = from Chris
                if name == "sara":
                    from_data.append(0)
                else:
                    from_data.append(1)
                email.close()
    print "emails processed"
    from_sara.close()
    from_chris.close()
    pickle.dump( word_data, open("your_word_data.pkl", "w") )
    pickle.dump( from_data, open("your_email_authors.pkl", "w") )
    return word_data
#ler emails de cada colaborador que possui um endereco de email no dicionario #nesse processo a funcao parseOutText ja retira os stopwords print "Iniciando leitura dos emails" for name in data_dict: try: from_person = open( "emails_by_address/from_" + data_dict[name]["email_address"] + ".txt", "r") temp_counter = 0 words = "" for path in from_person: #por questao de tempo de processamento, so li 100 emails de cada pessoa if temp_counter < 100: path = os.path.join('..', path[20:len(path) - 1:]) email = open(path, "r") w = parseOutText(email) words = words + w email.close() temp_counter += 1 word_data.append(words) from_data.append(name) if data_dict[name]["poi"] == 1: p.append(1) else: p.append(0) from_person.close() except: continue print "emails processados" #transformar a lista de emails de cada colaborador em um vetor de palavras
### temp_counter helps you only look at the first 200 emails in the list temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset temp_counter += 1 #if temp_counter < 200: path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email email_parsed = parseOutText(email) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] email_parsed = string.replace(email_parsed, "sara", "") email_parsed = string.replace(email_parsed, "shackleton", "") email_parsed = string.replace(email_parsed, "chris", "") email_parsed = string.replace(email_parsed, "germani", "") ### append the text to word_data word_data.append(email_parsed) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris from_data.append(0) if name == "sara" else from_data.append(1) email.close() print "emails processed"
### temp_counter helps you only look at the first 200 emails in the list temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset temp_counter += 1 #if temp_counter < 20000000000: if temp_counter < 5: path = os.path.join('..', path[:-1]) print path email = open(path, "r") emailText = parseOutText(email).replace("sara","").replace("shackleton","").replace("chris","").replace("germani","").replace("sshacklensf","").replace("cgermannsf","") ### use parseOutText to extract the text from the opened email ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] ### append the text to word_data word_data.append(emailText) if name == 'sara': from_data.append(0) else: from_data.append(1) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris email.close()
for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset # temp_counter += 1 if temp_counter < 200: path = os.path.join('..', path[:-1]) # print path email = open(path, "r") ### use parseOutText to extract the text from the opened email parsed_email = parseOutText(email).encode('ascii') ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] for outcast in ["sara", "shackleton", "chris", "germani"]: parsed_email = parsed_email.replace(outcast, "") ### append the text to word_data word_data.append(parsed_email) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris from_data.append((0 if name == "sara" else 1)) email.close() print "emails processed"
### can take a long time ### temp_counter helps you only look at the first 200 emails in the list temp_counter = 0 from nltk.corpus import * sw = stopwords.words('english') for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset ##temp_counter += 1 ##if temp_counter < 200: path = os.path.join('..', path[:-1]) print path email = open(path, "r") content = parseOutText(email) #for i in ["sara", "shackleton", "chris", "germani"]: # text.replace(i, "") stop_words = [ "sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf" ] for w in stop_words: content = content.replace(w, "") ### append the text to word_data word_data.append(content) if name == "sara":
### can iterate your modifications quicker
temp_counter = 0
for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        ### temp_counter += 1
        # gate is constant while the increment above stays commented out
        if temp_counter >= 200:
            continue
        path = os.path.join('..', path[:-1])
        #print path
        email = open(path, "r")
        # parse/stem the email body
        stemmed = parseOutText(email)
        # remove the author-identifying words
        for marker in ("sara", "shackleton", "chris", "germani"):
            stemmed = stemmed.replace(marker, "")
        word_data.append(stemmed)
        # label: 0 for Sara, 1 for Chris
        from_data.append(0 if name == "sara" else 1)
        email.close()
### can iterate your modifications quicker temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset # temp_counter += 1 if temp_counter < 200: path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email stemmed_words = parseOutText(email) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] for word in ["sara", "shackleton", "chris", "germani"]: stemmed_words = stemmed_words.replace(word, "") ### append the text to word_data word_data.append(stemmed_words) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name == "sara": from_data.append(0) else: from_data.append(1) email.close() print "emails processed"
# Console banner (ANSI color escapes) + progress bar over all 17578 emails.
print '[\033[91m LOADING\033[0m ] \033[94m\033[1mEmails are processing right now...\033[0m'
pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=17578).start()
for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        # only look at first 200 emails when developing
        # once everything is working, remove this line to run over full dataset
        # temp_counter += 1
        # if temp_counter < 200: #remove comment if you need 200
        path = os.path.join('..', path[:-1])
        # print '[\033[91m OK\033[0m ]' + path
        email = open(path, "r")
        # use parseOutText to extract the text from the opened email
        raw_txt = parseOutText(email)
        # strip the author-identifying words
        # ["sara", "shackleton", "chris", "germani"]
        unwanted_words = ["sara", "shackleton", "chris", "germani"]
        for word in unwanted_words:
            raw_txt = raw_txt.replace(word, "")
        # append the text to word_data
        word_data.append(raw_txt)
        # 0 if from Sara, 1 if from Chris (conditional expression used for
        # its side effect -- works, but an if/else would be clearer)
        from_data.append(0) if (name == "sara") else from_data.append(1)
        email.close()
        # advance the bar by the number of emails parsed so far
        pbar.update(len(word_data))
pbar.finish()
print "[\033[92m OK\033[0m ] \033[94m\033[1mEmails Processed\033[0m"
from_sara.close()
word_data = []
### temp_counter - limit no. emails processed
#temp_counter = 0
for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        #temp_counter += 1
        #if temp_counter < 100:
        path = os.path.join('..', path[:-1])
        print(path)
        email = open(path, "r")
        # parse/stem the body, then scrub the author-identifying tokens
        # (the membership check is redundant -- replace is a no-op when the
        # token is absent -- but preserved for identical behavior)
        text = parseOutText(email)
        for token in ("sara", "shackleton", "chris", "germani",
                      "sshacklensf", "cgermannsf"):
            if token in text:
                text = text.replace(token, "")
        word_data.append(text)
        # label: 0 = Sara, 1 = Chris
        from_data.append(0 if name == "sara" else 1)
### can iterate your modifications quicker temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset # temp_counter += 1 # if temp_counter < 200: path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email parsedLine = parseOutText(email) ### use str.replace() to remove any instances of the words removeArr = ["sara", "shackleton", "chris", "germani"] for elem in removeArr: parsedLine = parsedLine.replace(elem, "") ### append the text to word_data word_data.append(parsedLine) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if from_person == "sara": from_data.append(0) else: from_data.append(1) email.close() print "emails processed"
def dump_email_data(data_dict): directory = "emails_by_address/" counter = 0 word_data = {} ls = poiEmails() for key in ls: email = ls[key] path1 = directory + "from_" + email + ".txt" path2 = directory + "to_" + email + ".txt" words = "" try: f1 = open(path1, "r") ls1 = f1.readlines() f1.close() for path in ls1: path = "../" + path[:-1] f2 = open(path, "r") words = words + " " + (parseOutText(f2)) f2.close() except Exception: pass try: f1 = open(path2, "r") ls1 = f1.readlines() f1.close() for path in ls1: path = "../" + path[:-1] f2 = open(path, "r") words = words + " " + (parseOutText(f2)) f2.close() except Exception: pass if words != "": if key in word_data: word_data[key] = word_data[key] + " " + words else: word_data[key] = words del words for key in data_dict: email = data_dict[key]['email_address'] path1 = directory + "from_" + email + ".txt" path2 = directory + "to_" + email + ".txt" words = "" try: f1 = open(path1, "r") ls = f1.readlines() f1.close() for path in ls: path = "../" + path[:-1] f2 = open(path, "r") words = words + " " + (parseOutText(f2)) f2.close() except Exception: pass try: f1 = open(path2, "r") ls = f1.readlines() f1.close() for path in l1: path = "../" + path[:-1] f2 = open(path, "r") words = words + " " + (parseOutText(f2)) f2.close() except Exception: pass if words != "": if key in word_data: word_data[key] = word_data[key] + " " + words else: word_data[key] = words del words counter += 1 print counter pickle_out = open("email_data.pkl", "wb") pickle.dump(word_data, pickle_out) pickle_out.close() print "\nSuccess!\nEmail Data Fetched."
temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: # only look at first 200 emails when developing # once everything is working, remove this line to run over full dataset path = os.path.join('../..', 'enron_dataset/' + path[:-1]) # use parseOutText to extract the text from the opened email # use str.replace() to remove any instances of the words # ["sara", "shackleton", "chris", "germani"] # append the text to word_data # append a 0 to from_data if email is from Sara, and 1 if email is from Chris try: email = open(path, "r") text = parseOutText( email) # parse every word of email with stemming text = str(text) text = text.replace('sara ', '') text = text.replace('shackleton ', '') text = text.replace('chris ', '') text = text.replace('germani ', '') word_data.append(text) if name == 'sara': from_data.append(0) else: from_data.append(1) email.close() except IOError: pass print word_data[147]
### can take a long time ### temp_counter helps you only look at the first 200 emails in the list temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset #temp_counter += 1 #if temp_counter < 200: path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email stemmed_text = parseOutText(email) ### use str.replace() to remove any instances of the words delete_words = [ "sara", "shackleton", "chris", "germani", 'sshacklensf', 'cgermannsf' ] for word in delete_words: stemmed_text = stemmed_text.replace(word, '').strip() ### append the text to word_data word_data.append(stemmed_text) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name == 'sara': from_data.append(0) else: from_data.append(1)
def processMails(p): np.set_printoptions(threshold=np.nan) sys.path.append("../tools/") from parse_out_email_text import parseOutText """ Starter code to process the emails from Sara and Chris to extract the features and get the documents ready for classification. The list of all the emails from Sara are in the from_sara list likewise for emails from Chris (from_chris) The actual documents are in the Enron email dataset, which you downloaded/unpacked in Part 0 of the first mini-project. If you have not obtained the Enron email corpus, run startup.py in the tools folder. The data is stored in lists and packed away in pickle files at the end. """ f_list = open(p, "r") from_data = [] word_data = [] ### temp_counter is a way to speed up the development--there are ### thousands of emails from Sara and Chris, so running over all of them ### can take a long time ### temp_counter helps you only look at the first 200 emails in the list so you ### can iterate your modifications quicker temp_counter = 0 n = 0 for path in f_list: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset if temp_counter < 100: temp_counter += 1 path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email parsed_email = parseOutText(email) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] #words_to_remove = ["sara", "shackleton", "chris", "germani"] #print parsed_email #for word in words_to_remove: # parsed_email = parsed_email.replace(word+' ', "") #print parsed_email ### append the text to word_data word_data.append(parsed_email) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris n += 1 email.close() #print "Email not found" print n, "emails processed" f_list.close() return word_data
### temp_counter helps you only look at the first 200 emails in the list so you ### can iterate your modifications quicker temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset #temp_counter += 1 #if temp_counter < 200: path = os.path.join('..', path[:-1]) email = open(path, "r") ### use parseOutText to extract the text from the opened email email_text = parseOutText(email) ### use str.replace() to remove any instances of the words remove = ["sara", "shackleton", "chris", "germani"] for word in remove: email_text = email_text.replace(word, "") ### append the text to word_data word_data.append(email_text) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris from_data.append(0 if name == "sara" else 1) email.close() print "emails processed" from_sara.close()
### thousands of emails from Sara and Chris, so running over all of them ### can take a long time ### temp_counter helps you only look at the first 200 emails in the list temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset temp_counter += 1 #if temp_counter < 200: path = os.path.join('..', path[:-1]) print "In: ", path email = open(path, "r") words_stem = str(parseOutText(email)) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] for word in sw: if(word in words_stem): words_stem = words_stem.replace(word, "") word_data.append(words_stem)### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name is "sara": from_data.append("0") elif name is "chris": from_data.append("1") email.close() print "All emails processed" from_sara.close() from_chris.close()
### temp_counter helps you only look at the first 200 emails in the list so you ### can iterate your modifications quicker temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset temp_counter += 1 if temp_counter > 0: path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email extract_text = parseOutText(email) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] for w in [ "sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf" ]: extract_text = extract_text.replace(w, '') ### append the text to word_data word_data.append(extract_text) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name == 'sara': from_data.append(0) else: from_data.append(1)
### temp_counter helps you only look at the first 200 emails in the list so you ### can iterate your modifications quicker temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset temp_counter += 1 if temp_counter < 200: path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email email_body = parseOutText(email) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] signature_words = ["sara", "shackleton", "chris", "germani","sshacklensf","cgermannsf"] ### append the text to word_data for word in signature_words: if word in email_body: email_body=email_body.replace(word,"") word_data.append(email_body) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if from_person == from_sara : from_data.append(0) else: from_data.append(1)
### can take a long time
### temp_counter helps you only look at the first 200 emails in the list so you
### can iterate your modifications quicker
temp_counter = 0
for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        path = os.path.join('..', path[:-1])
        print (path)
        email = open(path, "r")
        ### use parseOutText to extract the text from the opened email
        parsed_email = parseOutText(email)
        ### BUG FIX: str.replace returns a NEW string; the old code threw
        ### the result away, so nothing was ever removed.  Reassign it.
        for word in ("sara", "shackleton", "chris", "germani"):
            parsed_email = parsed_email.replace(word, "")
        word_data.append(parsed_email)
        ### BUG FIX: the old code compared the file object from_person to
        ### the strings "sara"/"chris" (always False), so NO labels were
        ### ever appended.  Compare the name tag: 0 = Sara, 1 = Chris.
        if name == "sara":
            from_data.append(0)
        elif name == "chris":
            from_data.append(1)
        ### FIX: close the email file (was leaked)
        email.close()
###temp_counter += 1 ###if temp_counter < 200: path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] ### append the text to word_data ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris test = parseOutText(email) test = test.replace("sara", "").replace("shackleton", "").replace("chris", "").replace("germani", "") test = test.replace("sshacklensf", "").replace("cgermannsf", "") word_data.append(test) from_data.append(0 if name == "sara" else 1) email.close() print "emails processed" from_sara.close() from_chris.close() pickle.dump( word_data, open("your_word_data.pkl", "w") )
### can iterate your modifications quicker temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset temp_counter += 1 if temp_counter < 500000: path = os.path.join('..', path[:-1]) print path try: email = open(path, "r") ### use parseOutText to extract the text from the opened email parsedText = parseOutText(email) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani, sshacklensf, cgermannsf"] parsedText = parsedText.replace("sara", "") parsedText = parsedText.replace("shackleton", "") parsedText = parsedText.replace("chris", "") parsedText = parsedText.replace("germani", "") parsedText = parsedText.replace("sshacklensf", "") parsedText = parsedText.replace("cgermannsf", "") ### append the text to word_data word_data.append(parsedText) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name == 'sara':
### can iterate your modifications quicker
temp_counter = 0
for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        temp_counter += 1
        # if temp_counter < 200:
        path = os.path.join('..', path[:-1])
        email = open(path, "r")
        # parse/stem the opened email (module import is cached after the
        # first iteration, so the in-loop import is cheap)
        import parse_out_email_text
        stemmed = parse_out_email_text.parseOutText(email)
        # print stemmed
        # scrub the author-identifying words
        for blacklisted in ("sara", "shackleton", "chris", "germani",
                            "sshacklensf", "cgermannsf"):
            stemmed = stemmed.replace(blacklisted, '')
        word_data.append(stemmed)
        # label: 0 = Sara, 1 = Chris
        from_data.append(0 if name == 'sara' else 1)
### thousands of emails from Sara and Chris, so running over all of them ### can take a long time ### temp_counter helps you only look at the first 200 emails in the list so you ### can iterate your modifications quicker for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset path = os.path.join('../../enron_mail_20150507/enron_mail_20150507', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email words = parseOutText(email) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] words = words.replace("sara", "") words = words.replace("shackleton", "") words = words.replace("chris", "") words = words.replace("germani", "") words = words.replace("sshacklensf", "") words = words.replace("cgermannsf", "") ### append the text to word_data word_data.append(words) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name == "sara": from_data.append(0)
### can iterate your modifications quicker temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset # temp_counter += 1 if temp_counter < 10000: path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email string = parseOutText(email) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] for i in ["sara", "shackleton", "chris", "germani", 'sshacklensf', 'cgermannsf']: string = string.replace(i, '') # string = string.replace('\r\n', ' ') # string = string.replace(' ', ' ') ### append the text to word_data word_data.append(string) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name == "sara": from_data.append(0) else:
### temp_counter helps you only look at the first 200 emails in the list so you ### can iterate your modifications quicker temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset temp_counter += 1 if temp_counter < 200: path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email email_spammed = parseOutText(email) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] for i in ["sara", "shackleton", "chris", "germani"]: email_spammed.replace(i, '') ### append the text to word_data word_data.append(email_spammed) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris from_data.append(0 if name == 'sara' else 1) email.close() print "emails processed"
### thousands of emails from Sara and Chris, so running over all of them
### can take a long time
### temp_counter helps you only look at the first 200 emails in the list so you
### can iterate your modifications quicker
temp_counter = 0
for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        path = os.path.join('..', path[:-1])
        print(path)
        email = open(path, "r")
        # parse/stem the email body
        text = parseOutText(email)
        # drop the author-identifying words
        for marker in ("sara", "shackleton", "chris", "germani",
                       "sshacklensf", "cgermannsf"):
            text = text.replace(marker, "")
        word_data.append(text)
        # label: 0 = Sara, 1 = Chris
        if name == "sara":
            from_data.append(0)
        else:
            from_data.append(1)
        email.close()
print("emails processed")
from_sara.close()
from_chris.close()
### temp_counter helps you only look at the first 200 emails in the list so you
### can iterate your modifications quicker
# temp_counter = 0
for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        # temp_counter += 1
        # if temp_counter < 200:
        path = os.path.join('..', path[:-1])
        # print path
        email = open(path, "r")
        # parse/stem the opened email
        cleaned = str((parseOutText(email)))
        # print (cleaned)
        # scrub the author-identifying tokens (the membership check is
        # redundant -- replace is a no-op when absent -- but preserved)
        for token in ("sara", "shackleton", "chris", "germani",
                      "sshacklensf", "cgermannsf"):
            if token in cleaned:
                cleaned = cleaned.replace(token, '')
        word_data.append(cleaned)
def dump_data(): """ Starter code to process the emails from Sara and Chris to extract the features and get the documents ready for classification. The list of all the emails from Sara are in the from_sara list likewise for emails from Chris (from_chris) The actual documents are in the Enron email dataset, which you downloaded/unpacked in Part 0 of the first mini-project. If you have not obtained the Enron email corpus, run startup.py in the tools folder. The data is stored in lists and packed away in pickle files at the end. """ from_sara = open("from_sara.txt", "r") from_chris = open("from_chris.txt", "r") from_data = [] word_data = [] ### temp_counter is a way to speed up the development--there are ### thousands of emails from Sara and Chris, so running over all of them ### can take a long time ### temp_counter helps you only look at the first 200 emails in the list so you ### can iterate your modifications quicker temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset # temp_counter += 1 # if temp_counter < 200: path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email parsed_out_text = parseOutText(email) ### use str.replace() to remove any instances of the words removewords = ["sara", "shackleton", "chris", "germani", "sshacklensf","cgermannsf"] for word in removewords: parsed_out_text = parsed_out_text.replace(word,"") ### append the text to word_data word_data.append(parsed_out_text) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris from_data.append(0 if name == "sara" else 1) email.close() print "emails processed" from_sara.close() from_chris.close() print "Answer to Lesson 10 Quiz, content of word_data[152]", word_data[152] pickle.dump( word_data, 
open("your_word_data.pkl", "w") ) pickle.dump( from_data, open("your_email_authors.pkl", "w") ) return word_data, from_data
### temp_counter helps you only look at the first 200 emails in the list temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset temp_counter += 1 if temp_counter < 200: path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email parsed_email = parseOutText(email) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] noise_words = ["sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf"] for word in noise_words: parsed_email = parsed_email.replace(word,'') ### append the text to word_data word_data.append(parsed_email) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris from_data.append(0 if name == "sara" else 1) email.close()
temp_counter = 0  # dev-mode counter; the cap check below is disabled
# Words scrubbed from every email so a classifier cannot key on signatures.
# NOTE(review): unlike sibling variants, the mail aliases "sshacklensf" /
# "cgermannsf" are NOT removed here -- confirm that is intended.
replaceWords = ["sara", "shackleton", "chris", "germani"]
for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        temp_counter += 1
        # if temp_counter < 101:
        path = os.path.join('..', path[:-1])  # drop trailing newline
        # print path
        email = open(path, "r")
        ### use parseOutText to extract the stemmed text from the opened email
        parsedEmail = parseOutText(email)
        ### use str.replace() to remove any instances of the words
        for word in replaceWords:
            parsedEmail = parsedEmail.replace(word, "")
        ### append the text to word_data
        word_data.append(parsedEmail)
        ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
        if(name == 'sara'):
            from_data.append(0)
        elif(name == 'chris'):
            from_data.append(1)
        email.close()
### can iterate your modifications quicker temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset #temp_counter += 1 if temp_counter < 200: path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email stemmed_email = parseOutText(email) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] remove = ["sara", "shackleton", "chris", "germani","sshacklensf","cgermannsf"] for word in remove: stemmed_email = stemmed_email.replace(word,'') ### append the text to word_data word_data.append(stemmed_email) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name == 'sara': from_data.append(0) if name == 'chris': from_data.append(1) email.close()
### temp_counter helps you only look at the first 200 emails in the list so you ### can iterate your modifications quicker temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset #temp_counter += 1 if temp_counter < 200: path = './' + path[:-1] print path email = open(path, "r") ### use parseOutText to extract the text from the opened email texto = parseOutText(email) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] for word in ["sara", "shackleton", "chris", "germani"]: texto = texto.replace(word, '') ### append the text to word_data word_data.append(texto) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name == 'sara': from_data.append(0) else: from_data.append(1) email.close() print "emails processed" from_sara.close() from_chris.close()
### temp_counter helps you only look at the first 200 emails in the list so you ### can iterate your modifications quicker temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset temp_counter += 1 if temp_counter < 200: path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email email_temp = str(parseOutText(email)) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] rem = ["sara", "shackleton", "chris", "germani"] for word in rem: if (word in email_temp): email_temp = email_temp.replace(word, "") ### append the text to word_data word_data.append(email_temp) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name == "sara": from_data.append(0) else: from_data.append(1) email.close()
# temp_counter = 0 words_to_remove = ["sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf"] for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset # temp_counter += 1 # if temp_counter < 200: path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email email_contents = parseOutText(email) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] for w in words_to_remove: email_contents = email_contents.replace(w, "") ### append the text to word_data word_data.append(email_contents) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name == "sara": from_data.append(0) else: from_data.append(1)
### can iterate your modifications quicker #temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset #temp_counter += 1 #if temp_counter < 200: path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email body_message = parseOutText(email) # print "1: ", body_message ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] replace_list = [ "sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf" ] for word in replace_list: body_message = body_message.replace(word, "") # print "2: ", body_message ### append the text to word_data
### can iterate your modifications quicker temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset # temp_counter += 1 if temp_counter < 200: path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email text = parseOutText(email) ### use str.replace() to remove any instances of the words words = ["sara", "shackleton", "chris", "germani","sshacklensf","cgermannsf"] for word in words: text = text.replace(word, '') ### append the text to word_data word_data.append(text) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris print name if name=='sara': from_data.append(0) else: from_data.append(1)
# Pre-compiled whitespace pattern, used below to collapse runs of whitespace
# into single spaces after scrubbing.
# NOTE(review): prefer a raw string (r'\s+') for regex patterns.
pattern=re.compile('\s+')
for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        temp_counter += 1
        #if temp_counter < 200:
        # NOTE(review): `temp_counter` is incremented but must be initialised
        # earlier in the file; the dev-mode cap itself is disabled by the
        # always-true guard below.
        if(True):
            path = os.path.join('..', path[:-1])#remove the '\n'
            print path
            email = open(path, "r")
            ### use parseOutText to extract the stemmed text from the opened email
            temp = parseOutText(email)
            ### use str.replace() to remove any instances of the words
            ### ["sara", "shackleton", "chris", "germani"]
            # Authors' names and mail aliases, scrubbed one at a time.
            temp=temp.replace("sara","")
            temp=temp.replace("shackleton","")
            temp=temp.replace("chris","")
            temp=temp.replace("germani","")
            temp=temp.replace("sshacklensf","")
            temp=temp.replace("cgermannsf","")
            # Collapse all whitespace runs to single spaces.
            temp=re.sub(pattern,' ',temp)
            ### append the text to word_data
            word_data.append(temp)
            ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
### can take a long time ### temp_counter helps you only look at the first 200 emails in the list so you ### can iterate your modifications quicker counter=0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing #if counter<100: counter+=1 path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email email_text=str(parseOutText(email)) #email_text=parseOutText(email) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] email_text=email_text.replace("sara", "").replace("shackleton", "").replace("chris", "").replace("germani", "").replace("sshacklensf", "").replace("cgermannsf", "") email_text=email_text.strip() ### append the text to word_data word_data.append(email_text) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris #print name=="sara" #print name from_data.append((0 if name=="sara" else 1)) email.close()
# temp_counter helps you only look at the first 200 emails in the list so you # can iterate your modifications quicker temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: # only look at first 200 emails when developing # once everything is working, remove this line to run over full dataset # temp_counter += 1 # if temp_counter < 200: path = os.path.join('../../..', path[:-1]) print(path) email = open(path, "r") # use parseOutText to extract the text from the opened email text = parse_out_email_text.parseOutText(email) # use str.replace() to remove any instances of the words # ["sara", "shackleton", "chris", "germani"] # print(type(text)) text = text.replace('sara', '') text = text.replace('shackleton', '') text = text.replace('chris', '') text = text.replace('germani', '') # append the text to word_data word_data.append(text) # append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name == 'sara': from_data.append(0) else: from_data.append(1)
### can iterate your modifications quicker temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset temp_counter += 1 if temp_counter < 200: path = os.path.join('..', path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email words = parseOutText(email) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] words.replace("sara", "").replace("shackleton", "").replace("chris", "").replace("germani", "") ### append the text to word_data word_data.append(words) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris from_data.append(0 if name == "sara" else 1) email.close() print "emails processed" from_sara.close() from_chris.close() pickle.dump( word_data, open("your_word_data.pkl", "w") )
### can iterate your modifications quicker temp_counter = 0 for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset temp_counter += 1 if temp_counter > 0: path = os.path.join("..", path[:-1]) print path email = open(path, "r") ### use parseOutText to extract the text from the opened email stemmedString = parseOutText(email) remove_words = ["sara", "shackleton", "chris", "germani"] for rw in remove_words: stemmedString = stemmedString.replace(rw, "") stemmedString = " ".join(stemmedString.split()) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] ### append the text to word_data word_data.append(stemmedString) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name == "sara":
### can take a long time ### temp_counter helps you only look at the first 200 emails in the list temp_counter = 0 from nltk.corpus import * sw = stopwords.words('english') for name, from_person in [("sara", from_sara), ("chris", from_chris)]: for path in from_person: ### only look at first 200 emails when developing ### once everything is working, remove this line to run over full dataset ##temp_counter += 1 ##if temp_counter < 200: path = os.path.join('..', path[:-1]) print path email = open(path, "r") content = parseOutText(email) #for i in ["sara", "shackleton", "chris", "germani"]: # text.replace(i, "") stop_words = ["sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf"] for w in stop_words: content = content.replace(w, "") ### append the text to word_data word_data.append( content ) if name == "sara": from_data.append(0) else:
# Accumulators: stemmed document texts and their author labels.
word_data = []
from_data = []
from parse_out_email_text import parseOutText
for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        #temp_counter += 1
        #if temp_counter < 200:
        origPath = path  # keep the raw listing line for reference
        path = os.path.join('..', path[:-1])  # drop trailing newline
        print path
        email = open(path, "r")
        ### use parseOutText to extract the stemmed text from the opened email
        string1 = parseOutText(email)
        ### use str.replace() to remove any instances of the words
        ### ["sara", "shackleton", "chris", "germani"]
        # Aliases first, then names; candidates below were tried and left out.
        stringList = [
            "sshacklensf", "cgermannsf", "sara", "shackleton", "chris",
            "germani"
        ]
        #"houectect", "houston", "houect", "fax", "smith", "1400", "forward", "germany", "street", "77002"
        for string in stringList:
            string1 = string1.replace(string, "")
        ### append the text to word_data
        word_data.append(string1)
        ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris