示例#1
0
def get_word_data_from_email(email_addr, n_counter=10):
    """Return the parsed text of the first emails sent from an address.

    Reads the listing ``from_<email_addr>.txt`` inside
    ``../dataset/emails_by_address``. Every line of the listing names one
    email file, relative to that same directory. At most ``n_counter``
    emails are passed through ``parseOutText``, stripped of newline and
    carriage-return characters, and joined with single spaces.

    Args:
        email_addr: Address used to build the listing file name.
        n_counter: Maximum number of emails to read (default 10).

    Returns:
        The joined text, or the string ``"NaN"`` when the listing file
        cannot be opened.

    Raises:
        IOError: If an email named in the listing cannot be opened.
    """
    DIR = "../dataset/emails_by_address"
    try:
        f_emails = open("{dir}/from_{addr}.txt".format(addr=email_addr, dir=DIR), "r")
    except IOError:
        return "NaN"

    word_data = []
    # The context managers close the listing and every email even when
    # parsing fails part-way through.
    with f_emails:
        for counter, line in enumerate(f_emails, 1):
            if counter > n_counter:
                break
            # rstrip instead of [:-1]: the final line may lack a newline, in
            # which case slicing would cut off a real path character.
            path = "{dir}/".format(dir=DIR) + line.rstrip("\r\n")
            with open(path, "r") as email:
                ### use parseOutText to extract the text from the opened email
                words = parseOutText(email).replace("\n", "").replace("\r", "")
            word_data.append(words)

    return " ".join(word_data)
def addFeatures(data_dict, features_list):
    """Add a ``'mentions'`` feature to every person in ``data_dict``.

    For each person whose ``from_<email_address>.txt`` listing exists in
    ``emails_by_address_path``, every email named in that listing is parsed
    with ``parseOutText`` and the values returned by ``num_mentions`` are
    summed. People without a listing get ``mentions = 0``. The running total
    is printed per person.

    Args:
        data_dict: Maps a person to a feature dict that contains at least
            ``'email_address'``; each feature dict is updated in place.
        features_list: List of feature names; ``'mentions'`` is appended.

    Returns:
        The tuple ``(data_dict, features_list)`` holding the same objects
        that were passed in.
    """
    # A set keeps the per-person membership test O(1).
    emaillistings = set(os.listdir(emails_by_address_path))

    for person, features in data_dict.iteritems():
        mentions = 0
        listing = "from_" + features['email_address'] + ".txt"

        if listing in emaillistings:
            # Context managers close the listing and each email, which the
            # previous version never did.
            with open(emails_by_address_path + '/' + listing, 'r') as emailfiles:
                for emailfilepath in emailfiles:
                    with open(emailfilepath.strip()) as email:
                        words = parseOutText(email)

                    # contains email address or names
                    mentions += num_mentions(words, poiIdentifiers(), person)

        print(person + " : " + str(mentions))
        features['mentions'] = mentions

    features_list.append('mentions')

    return (data_dict, features_list)
def getSingleWordData(email):
    """Return the parsed text of the last email listed for ``email``.

    Opens ``./emails_by_address/from_<email>.txt``. For every line, the part
    starting at ``'maildir'`` is resolved relative to ``..`` and parsed with
    ``parseOutText``.

    Args:
        email: Address used to build the listing file name.

    Returns:
        ``[words]``, a one-element list holding the text of the last email
        parsed (``""`` when the listing is empty), or ``[]`` when the listing
        or one of the emails cannot be opened.

    Raises:
        ValueError: If a listing line does not contain ``'maildir'``.
    """
    listing_path = "./emails_by_address/" + "from_" + email + ".txt"
    words = ""
    try:
        with open(listing_path, 'r') as from_file:
            for line in from_file:
                # rstrip instead of [:-1] so a final line without a newline
                # keeps its last character.
                relative = line[line.index('maildir'):].rstrip('\r\n')
                with open(os.path.join('..', relative), 'r') as email_file:
                    # NOTE(review): every iteration overwrites `words`, so
                    # only the last email survives. Kept as in the original;
                    # confirm this is intended.
                    words = parseOutText(email_file)
    except IOError:
        return []
    return [words]
def updateWordData(email, label, word_data, word_data_label):
    """Append the parsed text of every email listed for ``email``.

    Opens ``./emails_by_address/from_<email>.txt``. For every line, the part
    starting at ``'maildir'`` is resolved relative to ``..``, parsed with
    ``parseOutText`` and appended to ``word_data``; ``label`` is appended to
    ``word_data_label`` alongside it.

    Args:
        email: Address used to build the listing file name.
        label: Value stored in ``word_data_label`` for each parsed email.
        word_data: List that receives the parsed texts (modified in place).
        word_data_label: List that receives the labels (modified in place).

    Returns:
        None. When the listing or one of the emails cannot be opened,
        processing stops silently; entries appended so far are kept.

    Raises:
        ValueError: If a listing line does not contain ``'maildir'``.
    """
    listing_path = "./emails_by_address/" + "from_" + email + ".txt"
    try:
        # Context managers close both handles even when an inner open fails.
        with open(listing_path, 'r') as from_file:
            for line in from_file:
                # rstrip instead of [:-1] so a final line without a newline
                # keeps its last character.
                relative = line[line.index('maildir'):].rstrip('\r\n')
                with open(os.path.join('..', relative), 'r') as email_file:
                    words = parseOutText(email_file)
                word_data.append(words)
                word_data_label.append(label)
    except IOError:
        # Best effort: a missing listing or email ends processing here.
        pass
def get_emails(email_addr, sig_words, n_emails, to=True, mail_root='C:/enron/'):
    '''Return the parsed text of a random sample of one person's emails.

    To get emails *to* this person, set to=True,
    To get emails *from* this person, set to=False.

    The listing ``emails_by_address/<to_|from_><email_addr>.txt`` is read and
    every line containing ``'maildir'`` is turned into an email location:
    the part from ``'maildir'`` onward, with ``'.'`` replaced by ``'_'``,
    appended to ``mail_root``.

    Args:
        email_addr: Address used to build the listing file name.
        sig_words: Iterable of strings removed from every parsed email.
        n_emails: Number of emails to sample, or ``'all'``. When fewer
            locations exist, all of them are used.
        to: Selects the ``to_`` listing when true, ``from_`` otherwise.
        mail_root: Prefix under which the emails are stored. Change it if
            your emails live elsewhere (default ``'C:/enron/'``).

    Returns:
        The parsed emails joined by single spaces into one string. Emails
        that cannot be opened are reported on stdout and skipped.

    Raises:
        IOError: If the listing file itself cannot be opened.
    '''
    prefix = 'to_' if to else 'from_'
    listing_path = 'emails_by_address' + '/' + prefix + email_addr + '.txt'
    with open(listing_path, 'r') as f:
        listing_lines = f.readlines()

    # keep only the lines that hold a valid email location
    email_locs = []
    for loc in listing_lines:
        if 'maildir' in loc:
            # rstrip instead of [:-1]: without a trailing newline the slice
            # would drop the last character of the file name.
            relative = loc[loc.index('maildir'):].rstrip('\r\n')
            email_locs.append(mail_root + relative.replace('.', '_'))

    # if n_emails is specified as 'all', check all the emails
    # if there aren't n_emails to check, check as many as we can
    if n_emails == 'all' or len(email_locs) < n_emails:
        n_emails = len(email_locs)

    # shuffle the email list so we can randomly choose n_emails
    shuffle(email_locs)
    email_locs = email_locs[:n_emails]

    emails = []
    for loc in email_locs:
        try:
            email = open(loc, 'r')
        except IOError:
            # just skip to the next email if one won't open
            print("\nTried and failed to open %s" % loc)
            continue
        # the context manager closes the file even if parsing raises
        with email:
            text = parseOutText(email)
        # remove instances of signature words
        for sig_word in sig_words:
            text = text.replace(sig_word, '')
        emails.append(text)

    # return the parsed emails as one big string
    return ' '.join(emails)
def vectorize_emails(from_sara, from_chris, words_to_remove=None, max_emails=False):
    '''
    Parse the emails listed in two open listing files and label them.

    This function can be used independently. It takes as input the file
    pointers from_sara and from_chris, mandatory, and optionally:
    a list of words to remove (if not sent, it won't remove
            any words from the emails)
    and a max_emails (if parameter not sent, it uses all emails available,
            if max_emails is larger than the number of emails available,
            it uses all available emails). The limit applies per person.

    Every line of a listing is resolved relative to ``..`` and parsed with
    ``parseOutText``. Both listing handles are closed before returning.

    Returns:
        ``(word_data, from_data)``: the parsed texts and, in the same order,
        their labels. Labels are strings: ``'1'`` for emails from Sara and
        ``'0'`` for emails from Chris.
    '''
    # None instead of a shared mutable default list.
    if words_to_remove is None:
        words_to_remove = []

    from_data = []
    word_data = []

    for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
        # NOTE(review): the original inline comment promised 0 for Sara and
        # 1 for Chris, but the code has always produced the opposite, as
        # strings. The behaviour is kept; confirm which one callers expect.
        label = str(int(name == 'sara'))
        temp_counter = 0
        for path in from_person:
            if max_emails and temp_counter == max_emails:
                break
            temp_counter += 1
            # rstrip instead of [:-1] so a final line without a newline
            # keeps its last character.
            path = os.path.join('..', path.rstrip('\r\n'))

            ### use parseOutText to extract the text from the opened email;
            ### the context manager closes the file even if parsing raises
            with open(path, "r") as email:
                text = parseOutText(email)

            ### use str.replace() to remove any instances of the words
            for word in words_to_remove:
                text = text.replace(word, '')

            word_data.append(text)
            from_data.append(label)

    print("emails processed")
    from_sara.close()
    from_chris.close()

    return word_data, from_data
def extractTextData(email_list):
    """Parse every email sent by the listed addresses and pickle the result.

    For each ``(email_address, poi)`` pair the listing
    ``from_<email_address>.txt`` in ``emails_by_address_path`` is read; each
    line is resolved against ``mail_dir_path`` and parsed with
    ``parseOutText``. Progress is printed every 5000 emails.

    Three pickle files are written to the current directory:
        enron_word_data.pkl      list of all parsed texts
        enron_email_authors.pkl  the ``poi`` value of each text, same order
        emails_from_person.pkl   dict mapping address to its parsed texts

    Args:
        email_list: Iterable of ``(email_address, poi)`` pairs.

    Returns:
        None.
    """
    from_data = []
    word_data = []
    emails_nf = []
    person_emails = {}
    temp_counter = 0

    for email_address, poi in email_list:
        word_data_from_person = []
        try:
            listing_path = os.path.join(emails_by_address_path, "from_" + email_address + ".txt")
            # The context manager replaces a close() that sat outside the
            # try block and raised NameError when the first listing was
            # missing.
            with open(listing_path, "r") as from_person:
                for path in from_person:
                    temp_counter += 1
                    if (temp_counter % 5000) == 0:
                        print("{} parsed emails".format(temp_counter))

                    # rstrip instead of [:-1] so a final line without a
                    # newline keeps its last character.
                    path = os.path.join(mail_dir_path, path.rstrip("\r\n"))

                    ### using parseOutText to extract the text from the opened email
                    with open(path, "r") as email:
                        text = parseOutText(email)

                    word_data.append(text)
                    word_data_from_person.append(text)
                    ### the poi flag supplied by the caller is the label
                    from_data.append(poi)
            person_emails[email_address] = word_data_from_person
        except IOError:
            # Only missing or unreadable files count as "not found"; other
            # errors are real bugs and now surface instead of being hidden.
            emails_nf.append(email_address)

    print("all emails processed: %s total" % temp_counter)
    print("%s emails not found" % len(emails_nf))

    # Binary mode is what pickle expects; `with` flushes and closes the files.
    outputs = [
        ("enron_word_data.pkl", word_data),
        ("enron_email_authors.pkl", from_data),
        ("emails_from_person.pkl", person_emails),
    ]
    for filename, payload in outputs:
        with open(filename, "wb") as out:
            pickle.dump(payload, out)
def get_word_data():
    """Parse the emails listed in ``from_sara`` / ``from_chris`` and pickle them.

    Works on module-level state: reads the open listing files ``from_sara``
    and ``from_chris`` and the counter ``temp_counter``, and appends to the
    lists ``word_data`` and ``from_data``. Each listed email is resolved
    relative to ``..``, parsed with ``parseOutText`` and stripped of the
    identifying words below. The label is 0 for Sara and 1 for Chris.

    Both listings are closed afterwards and the two lists are written to
    ``your_word_data.pkl`` and ``your_email_authors.pkl``.

    Returns:
        The module-level ``word_data`` list.
    """
    ### identifying words removed from every email (built once, not per email)
    remove = ["sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf"]

    for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
        label = 0 if name == "sara" else 1
        for path in from_person:
            ### development limiter: the counter increment is disabled, so
            ### this guard depends only on the module-level temp_counter value
            if temp_counter < 200:
                # rstrip instead of [:-1] so a final line without a newline
                # keeps its last character.
                path = os.path.join('..', path.rstrip('\r\n'))
                print(path)

                ### use parseOutText to extract the text from the opened email;
                ### the context manager closes the file even if parsing raises
                with open(path, "r") as email:
                    email_parsed = parseOutText(email)

                for word in remove:
                    email_parsed = email_parsed.replace(word, "")

                word_data.append(email_parsed)
                from_data.append(label)

    print("emails processed")
    from_sara.close()
    from_chris.close()

    # Binary mode is what pickle expects; `with` flushes and closes the files.
    with open("your_word_data.pkl", "wb") as out:
        pickle.dump(word_data, out)
    with open("your_email_authors.pkl", "wb") as out:
        pickle.dump(from_data, out)

    return word_data
示例#9
0
# Read the emails of every person in the dictionary that has an email address.
# According to the original author, parseOutText already removes the
# stopwords during this step.
print("Iniciando leitura dos emails")
for name in data_dict:
    try:
        # Context managers close the listing and each email even on errors.
        with open(
                "emails_by_address/from_" + data_dict[name]["email_address"] +
                ".txt", "r") as from_person:
            temp_counter = 0
            parts = []
            for path in from_person:
                # To keep processing time down, only 100 emails are read per
                # person; stop instead of scanning the rest of the listing.
                if temp_counter >= 100:
                    break
                # Drops the first 20 characters of the line (presumably the
                # 'enron_mail_20110402/' prefix -- confirm against the
                # listing files) and the line terminator.
                path = os.path.join('..', path.rstrip('\r\n')[20:])
                with open(path, "r") as email:
                    parts.append(parseOutText(email))
                temp_counter += 1
    except IOError:
        # No listing (or an unreadable email) for this person: skip them.
        # Other exceptions are no longer swallowed silently.
        continue
    # One document per person: all parsed emails concatenated without a
    # separator, exactly as before.
    words = "".join(parts)
    word_data.append(words)
    from_data.append(name)
    p.append(1 if data_dict[name]["poi"] == 1 else 0)
print("emails processados")

# Next step: turn each person's list of emails into a vector of words.
示例#10
0
### temp_counter counts the emails processed; the 200-email development
### limit that used it is disabled
temp_counter = 0


for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        temp_counter += 1
        # rstrip instead of [:-1] so a final line without a newline keeps
        # its last character.
        path = os.path.join('..', path.rstrip('\r\n'))
        print(path)

        ### use parseOutText to extract the text from the opened email;
        ### the context manager closes the file even if parsing raises
        with open(path, "r") as email:
            email_parsed = parseOutText(email)

        ### remove the identifying words with str.replace(); the module-level
        ### string.replace() function is deprecated and gone in Python 3
        for word in ("sara", "shackleton", "chris", "germani"):
            email_parsed = email_parsed.replace(word, "")

        ### append the text to word_data
        word_data.append(email_parsed)
        ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
        from_data.append(0 if name == "sara" else 1)

print("emails processed")
示例#11
0
### temp_counter caps how many listing lines are processed in total
temp_counter = 0

### identifying words removed from every parsed email, in this order
signature_words = ("sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf")

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### the counter is shared by both listings, so only the first four
        ### lines seen overall are processed; later lines are read and skipped
        temp_counter += 1
        if temp_counter >= 5:
            continue

        path = os.path.join('..', path[:-1])
        print(path)
        email = open(path, "r")

        ### extract the text from the opened email, then strip the words
        emailText = parseOutText(email)
        for signature_word in signature_words:
            emailText = emailText.replace(signature_word, "")

        ### store the text and its label: 0 for Sara, 1 for Chris
        word_data.append(emailText)
        from_data.append(0 if name == 'sara' else 1)

        email.close()
示例#12
0

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### development limiter: the increment of temp_counter is commented
        ### out, so this guard depends on whatever value it already holds
        if temp_counter >= 200:
            continue

        path = os.path.join('..', path[:-1])
        email = open(path, "r")

        ### extract the text from the opened email and encode it as ASCII
        parsed_email = parseOutText(email)
        parsed_email = parsed_email.encode('ascii')

        ### strip the identifying words one after another
        for outcast in ("sara", "shackleton", "chris", "germani"):
            parsed_email = parsed_email.replace(outcast, "")

        ### store the text together with its label: 0 for Sara, 1 for Chris
        word_data.append(parsed_email)
        if name == "sara":
            from_data.append(0)
        else:
            from_data.append(1)

        email.close()

print("emails processed")
示例#13
0
### Parse every email listed in from_sara / from_chris and collect the text
### in word_data after removing identifying words.
### temp_counter is initialised here, but the 200-email development limit
### that would use it is commented out below.
temp_counter = 0
from nltk.corpus import *
# English stopword list from NLTK; it is not referenced again in the visible
# part of this snippet.
sw = stopwords.words('english')

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        ##temp_counter += 1
        ##if temp_counter < 200:
        # Drop the last character of the line (assumed to be the newline --
        # TODO confirm the listing always ends with one) and resolve the path
        # relative to the parent directory.
        path = os.path.join('..', path[:-1])
        print path
        # NOTE(review): this handle is not closed anywhere in the visible
        # part of the snippet.
        email = open(path, "r")
        content = parseOutText(email)

        #for i in ["sara", "shackleton", "chris", "germani"]:
        #    text.replace(i, "")
        # Identifying words that are removed from the parsed text.
        stop_words = [
            "sara", "shackleton", "chris", "germani", "sshacklensf",
            "cgermannsf"
        ]

        for w in stop_words:
            content = content.replace(w, "")

        ### append the text to word_data
        word_data.append(content)

        # NOTE(review): the snippet ends here. The body of this `if` (and
        # whatever follows it) is missing from the source, so the code is
        # incomplete as it stands.
        if name == "sara":
示例#14
0
### temp_counter stays at 0 here: its increment is commented out, so the
### 200-email guard below lets every listed email through
temp_counter = 0

### words removed from every parsed email
identifier_words = ("sara", "shackleton", "chris", "germani")

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        if temp_counter >= 200:
            continue

        path = os.path.join('..', path[:-1])
        email = open(path, "r")

        ### extract the text from the opened email and drop the identifiers
        extract = parseOutText(email)
        for i in identifier_words:
            extract = extract.replace(i, "")

        ### store the text with its label: 0 for Sara, 1 for Chris
        word_data.append(extract)
        from_data.append(0 if name == "sara" else 1)

        email.close()
示例#15
0
### temp_counter stays at 0 because its increment is disabled, so the
### `temp_counter < 200` guard below currently lets every email through
temp_counter = 0


for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this guard to run over full dataset
        if temp_counter < 200:
            # rstrip instead of [:-1] so a final line without a newline
            # keeps its last character.
            path = os.path.join('..', path.rstrip('\r\n'))
            print(path)

            ### use parseOutText to extract the text from the opened email;
            ### the context manager closes the file even if parsing raises
            with open(path, "r") as email:
                stemmed_words = parseOutText(email)

            ### use str.replace() to remove any instances of the words
            for word in ["sara", "shackleton", "chris", "germani"]:
                stemmed_words = stemmed_words.replace(word, "")

            ### append the text to word_data
            word_data.append(stemmed_words)
            ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
            from_data.append(0 if name == "sara" else 1)

print("emails processed")
print('[\033[91m LOADING\033[0m ] \033[94m\033[1mEmails are processing right now...\033[0m')


# The progress bar is sized for 17578 emails and advances with the number of
# texts collected so far.
pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=17578).start()
for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        # the 200-email development limit is disabled: every email is read
        path = os.path.join('..', path[:-1])
        email = open(path, "r")

        # extract the text from the opened email
        raw_txt = parseOutText(email)

        # strip the identifying words from the text
        unwanted_words = ["sara", "shackleton", "chris", "germani"]
        for word in unwanted_words:
            raw_txt = raw_txt.replace(word, "")

        # store the text with its label: 0 for Sara, 1 for Chris
        word_data.append(raw_txt)
        if name == "sara":
            from_data.append(0)
        else:
            from_data.append(1)

        email.close()
        pbar.update(len(word_data))
pbar.finish()

print("[\033[92m OK\033[0m ] \033[94m\033[1mEmails Processed\033[0m")
from_sara.close()
word_data = []

### the temp_counter limit on the number of emails is disabled: every
### listed email is processed
### words removed from every parsed email
words_to_replace = ["sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf"]

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        # rstrip instead of [:-1] so a final line without a newline keeps
        # its last character.
        path = os.path.join('..', path.rstrip('\r\n'))
        print(path)

        ### use parseOutText to extract the text from the opened email; the
        ### context manager closes the handle, which was never closed before
        with open(path, "r") as email:
            stemmed_email = parseOutText(email)

        ### str.replace() is already a no-op when the word is absent, so no
        ### membership test is needed first
        for word in words_to_replace:
            stemmed_email = stemmed_email.replace(word, "")

        ### append the text to word_data
        word_data.append(stemmed_email)
        ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
        from_data.append(0 if name == "sara" else 1)
示例#18
0
### temp_counter is kept for the (currently disabled) 200-email development limit
temp_counter = 0


for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        # rstrip instead of [:-1] so a final line without a newline keeps
        # its last character.
        path = os.path.join('..', path.rstrip('\r\n'))
        print(path)

        ### use parseOutText to extract the text from the opened email;
        ### the context manager closes the file even if parsing raises
        with open(path, "r") as email:
            parsedLine = parseOutText(email)

        ### use str.replace() to remove any instances of the words
        removeArr = ["sara", "shackleton", "chris", "germani"]
        for elem in removeArr:
            parsedLine = parsedLine.replace(elem, "")

        ### append the text to word_data
        word_data.append(parsedLine)

        ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris.
        ### The label must be derived from `name`: the previous check compared
        ### the file handle `from_person` with "sara", which is never equal,
        ### so every email was labelled 1.
        if name == "sara":
            from_data.append(0)
        else:
            from_data.append(1)

print("emails processed")
示例#19
0
def _read_listed_emails(listing_path):
    """Return the parsed text of every email named in ``listing_path``.

    Each parsed email is prefixed with a single space. A listing or email
    that cannot be opened ends the read; the text gathered so far is kept.
    """
    pieces = []
    try:
        with open(listing_path, "r") as listing:
            email_paths = listing.readlines()
        for line in email_paths:
            # rstrip instead of [:-1] so a final line without a newline
            # keeps its last character.
            with open("../" + line.rstrip("\r\n"), "r") as email_file:
                pieces.append(parseOutText(email_file))
    except IOError:
        # Best effort for missing files only. The former blanket
        # `except Exception` also hid a NameError, which meant the "to_"
        # listings of data_dict were never read.
        pass
    return "".join(" " + piece for piece in pieces)


def _merge_words(word_data, key, words):
    """Store non-empty ``words`` under ``key``, appending to existing text."""
    if words == "":
        return
    if key in word_data:
        word_data[key] = word_data[key] + " " + words
    else:
        word_data[key] = words


def dump_email_data(data_dict):
    """Collect the email text of every known person and pickle it.

    For every entry returned by ``poiEmails()`` and then for every entry of
    ``data_dict`` (using its ``'email_address'`` field), the listings
    ``emails_by_address/from_<address>.txt`` and ``to_<address>.txt`` are
    read and the emails they name are parsed with ``parseOutText``. Text
    found for a key that already has text is appended to it, separated by a
    space. A running count of processed ``data_dict`` entries is printed.

    Args:
        data_dict: Maps a key to a dict that contains ``'email_address'``.

    Returns:
        None. The resulting ``{key: text}`` dict is written to
        ``email_data.pkl`` in the current directory.
    """
    directory = "emails_by_address/"
    counter = 0
    word_data = {}

    poi_emails = poiEmails()
    for key in poi_emails:
        email = poi_emails[key]
        words = (_read_listed_emails(directory + "from_" + email + ".txt") +
                 _read_listed_emails(directory + "to_" + email + ".txt"))
        _merge_words(word_data, key, words)

    for key in data_dict:
        email = data_dict[key]['email_address']
        # Both listings are read now. Previously the "to_" loop iterated
        # over the undefined name `l1` and silently contributed nothing.
        words = (_read_listed_emails(directory + "from_" + email + ".txt") +
                 _read_listed_emails(directory + "to_" + email + ".txt"))
        _merge_words(word_data, key, words)

        counter += 1
        print(counter)

    with open("email_data.pkl", "wb") as pickle_out:
        pickle.dump(word_data, pickle_out)

    print("\nSuccess!\nEmail Data Fetched.")
temp_counter = 0

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:

    for path in from_person:
        # every listed email lives below ../../enron_dataset/
        path = os.path.join('../..', 'enron_dataset/' + path[:-1])
        try:
            email = open(path, "r")
            # parse every word of the email with stemming
            text = str(parseOutText(email))
            # remove the identifying words (each followed by a space)
            for token in ('sara ', 'shackleton ', 'chris ', 'germani '):
                text = text.replace(token, '')
            # store the text with its label: 0 for Sara, 1 for Chris
            word_data.append(text)
            from_data.append(0 if name == 'sara' else 1)
            email.close()
        except IOError:
            # emails that cannot be read are skipped
            continue

print(word_data[147])
### temp_counter is kept for the (currently disabled) 200-email development limit
temp_counter = 0

### words removed from every parsed email
delete_words = [
    "sara", "shackleton", "chris", "germani", 'sshacklensf',
    'cgermannsf'
]

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        # rstrip instead of [:-1] so a final line without a newline keeps
        # its last character.
        path = os.path.join('..', path.rstrip('\r\n'))
        print(path)

        ### use parseOutText to extract the text from the opened email; the
        ### context manager closes the handle, which was never closed before
        with open(path, "r") as email:
            stemmed_text = parseOutText(email)

        ### remove each word, trimming surrounding whitespace after every removal
        for word in delete_words:
            stemmed_text = stemmed_text.replace(word, '').strip()

        ### append the text to word_data
        word_data.append(stemmed_text)

        ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
        from_data.append(0 if name == 'sara' else 1)
示例#22
0
def processMails(p):
    """Parse the first 100 emails listed in the file at ``p``.

    Each line of ``p`` is taken as an email path relative to the parent
    directory (``..``).  Every email is opened, passed through
    ``parseOutText`` and the returned text is collected.

    Side effects: disables NumPy print summarisation, makes sure
    ``../tools/`` is on ``sys.path``, and prints each path plus a final
    count.

    Args:
        p: path of a text file holding one email path per line.

    Returns:
        list: the parsed text of each processed email, in listing order.

    Raises:
        IOError: if the listing or a listed email cannot be opened.
    """
    # threshold=np.nan is rejected by current NumPy releases; sys.maxsize is
    # the documented way to always print arrays in full.
    np.set_printoptions(threshold=sys.maxsize)

    # Avoid growing sys.path by one duplicate entry on every call.
    if "../tools/" not in sys.path:
        sys.path.append("../tools/")
    from parse_out_email_text import parseOutText

    # Development cap kept from the original: only the first 100 emails.
    max_emails = 100
    word_data = []

    with open(p, "r") as f_list:
        for path in f_list:
            if len(word_data) >= max_emails:
                # Nothing further would be processed; stop reading.
                break

            # Each listing line ends with a newline; drop it before joining.
            path = os.path.join('..', path[:-1])
            print(path)

            with open(path, "r") as email:
                word_data.append(parseOutText(email))

    print("%d emails processed" % len(word_data))
    return word_data
示例#23
0
### temp_counter helps you only look at the first 200 emails in the list so you
### can iterate your modifications quicker
temp_counter = 0

for name, from_person in (("sara", from_sara), ("chris", from_chris)):
    for path in from_person:
        # Listing lines end with a newline; strip it and resolve the email
        # relative to the parent directory.
        path = os.path.join('..', path[:-1])
        email = open(path, "r")

        email_text = parseOutText(email)

        # Strip the signature words from the parsed text.
        remove = ["sara", "shackleton", "chris", "germani"]
        for word in remove:
            email_text = email_text.replace(word, "")

        word_data.append(email_text)

        # Author label: 0 for Sara, 1 for Chris.
        if name == "sara":
            from_data.append(0)
        else:
            from_data.append(1)

        email.close()

print("emails processed")
from_sara.close()
### thousands of emails from Sara and Chris, so running over all of them
### can take a long time
### temp_counter helps you only look at the first 200 emails in the list
# Counts every email visited; the development cap that would test it is
# commented out, so the full listings are processed.
temp_counter = 0

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        temp_counter += 1
        # Each listing line ends with a newline; drop it before joining.
        path = os.path.join('..', path[:-1])
        print("In:  %s" % path)

        # Close the email even if parsing raises.
        with open(path, "r") as email:
            words_stem = str(parseOutText(email))

        # `sw` is defined outside this block; every entry is removed from the
        # text.  str.replace is a no-op when the word is absent, so no
        # membership pre-check is needed.
        for word in sw:
            words_stem = words_stem.replace(word, "")
        word_data.append(words_stem)

        # Compare by value: `is` tests object identity and only appeared to
        # work for these literals because of string interning.
        # NOTE(review): labels are the strings "0"/"1" here, not integers;
        # kept as-is so downstream consumers see the same data.
        if name == "sara":
            from_data.append("0")
        elif name == "chris":
            from_data.append("1")

print("All emails processed")
from_sara.close()
from_chris.close()
示例#25
0
### temp_counter helps you only look at the first 200 emails in the list so you
### can iterate your modifications quicker
temp_counter = 0

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        temp_counter += 1
        # Always true after the increment: the development cap is
        # effectively disabled and every email is processed.
        if temp_counter > 0:
            # Each listing line ends with a newline; drop it before joining.
            path = os.path.join('..', path[:-1])
            print(path)

            # The context manager closes the email; the previous code never
            # closed it and leaked one handle per email.
            with open(path, "r") as email:
                extract_text = parseOutText(email)

            # Strip the signature words from the parsed text.
            for w in [
                    "sara", "shackleton", "chris", "germani", "sshacklensf",
                    "cgermannsf"
            ]:
                extract_text = extract_text.replace(w, '')
            word_data.append(extract_text)

            # Author label: 0 for Sara, 1 for Chris.
            if name == 'sara':
                from_data.append(0)
            else:
                from_data.append(1)
示例#26
0
### temp_counter helps you only look at the first 200 emails in the list so you
### can iterate your modifications quicker
temp_counter = 0

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        # Development cap: the counter is shared by both listings, so only
        # the first 199 emails overall are processed while it is active.
        temp_counter += 1
        if temp_counter < 200:
            # Each listing line ends with a newline; drop it before joining.
            path = os.path.join('..', path[:-1])
            print(path)

            # The context manager closes the email; the previous code never
            # closed it and leaked one handle per email.
            with open(path, "r") as email:
                email_body = parseOutText(email)

            # Strip the signature words from the parsed text.
            signature_words = ["sara", "shackleton", "chris", "germani","sshacklensf","cgermannsf"]
            for word in signature_words:
                email_body = email_body.replace(word, "")
            word_data.append(email_body)

            # Author label: 0 for Sara, 1 for Chris.
            if name == "sara":
                from_data.append(0)
            else:
                from_data.append(1)
示例#27
0
### can take a long time
### temp_counter helps you only look at the first 200 emails in the list so you
### can iterate your modifications quicker
temp_counter = 0


for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        # Each listing line ends with a newline; drop it before joining.
        path = os.path.join('..', path[:-1])
        print(path)

        # Close the email once parsed; it was previously left open.
        with open(path, "r") as email:
            parsed_email = parseOutText(email)

        # str.replace returns a new string, so the result must be rebound;
        # the previous code discarded it and removed nothing.
        for word in ("sara", "shackleton", "chris", "germani"):
            parsed_email = parsed_email.replace(word, "")
        word_data.append(parsed_email)

        # Compare the sender *name*: from_person is a file object and never
        # equals a string, so no label was ever appended before.
        if name == "sara":
            from_data.append(0)
        elif name == "chris":
            from_data.append(1)
示例#28
0
        ###temp_counter += 1
        ###if temp_counter < 200:
        path = os.path.join('..', path[:-1])
        print path
        email = open(path, "r")

        ### use parseOutText to extract the text from the opened email

        ### use str.replace() to remove any instances of the words
        ### ["sara", "shackleton", "chris", "germani"]

        ### append the text to word_data

        ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris

        test = parseOutText(email)

        test = test.replace("sara", "").replace("shackleton", "").replace("chris", "").replace("germani", "")
        test = test.replace("sshacklensf", "").replace("cgermannsf", "")

        word_data.append(test)
        from_data.append(0 if name == "sara" else 1)

        email.close()

print "emails processed"
from_sara.close()
from_chris.close()


pickle.dump( word_data, open("your_word_data.pkl", "w") )
### can iterate your modifications quicker
temp_counter = 0

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        temp_counter += 1
        if temp_counter < 500000:
            path = os.path.join('..', path[:-1])
            print path
            try:
                email = open(path, "r")

                ### use parseOutText to extract the text from the opened email
                parsedText = parseOutText(email)

                ### use str.replace() to remove any instances of the words
                ### ["sara", "shackleton", "chris", "germani, sshacklensf, cgermannsf"]
                parsedText = parsedText.replace("sara", "")
                parsedText = parsedText.replace("shackleton", "")
                parsedText = parsedText.replace("chris", "")
                parsedText = parsedText.replace("germani", "")
                parsedText = parsedText.replace("sshacklensf", "")
                parsedText = parsedText.replace("cgermannsf", "")
                ### append the text to word_data

                word_data.append(parsedText)

                ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
                if name == 'sara':
### can iterate your modifications quicker
temp_counter = 0

# Imported once here instead of on every loop iteration.
import parse_out_email_text

# Words stripped from every parsed email body.
removeWrods = ["sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf"]

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        # Counts visited emails; the development cap on it is commented out.
        temp_counter += 1
        # Each listing line ends with a newline; drop it before joining.
        path = os.path.join('..', path[:-1])

        # The context manager closes the email; the previous code never
        # closed it and leaked one handle per email.
        with open(path, "r") as email:
            stememail = parse_out_email_text.parseOutText(email)

        for rw in removeWrods:
            stememail = stememail.replace(rw,'')
        word_data.append(stememail)

        # Author label: 0 for Sara, 1 for Chris.
        if name=='sara':
            from_data.append(0)
        else:
            from_data.append(1)
### thousands of emails from Sara and Chris, so running over all of them
### can take a long time
### temp_counter helps you only look at the first 200 emails in the list so you
### can iterate your modifications quicker

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        path = os.path.join('../../enron_mail_20150507/enron_mail_20150507',
                            path[:-1])
        print path
        email = open(path, "r")

        ### use parseOutText to extract the text from the opened email
        words = parseOutText(email)

        ### use str.replace() to remove any instances of the words
        ### ["sara", "shackleton", "chris", "germani"]
        words = words.replace("sara", "")
        words = words.replace("shackleton", "")
        words = words.replace("chris", "")
        words = words.replace("germani", "")
        words = words.replace("sshacklensf", "")
        words = words.replace("cgermannsf", "")
        ### append the text to word_data
        word_data.append(words)

        ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
        if name == "sara":
            from_data.append(0)
示例#32
0
### can iterate your modifications quicker
temp_counter = 0


for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        # temp_counter += 1
        if temp_counter < 10000:
            path = os.path.join('..', path[:-1])
            print path
            email = open(path, "r")

            ### use parseOutText to extract the text from the opened email
            string = parseOutText(email)

            ### use str.replace() to remove any instances of the words
            ### ["sara", "shackleton", "chris", "germani"]
            for i in ["sara", "shackleton", "chris", "germani", 'sshacklensf', 'cgermannsf']:
                string = string.replace(i, '')

            # string = string.replace('\r\n', ' ')
            # string = string.replace('  ', ' ')
            ### append the text to word_data
            word_data.append(string)

            ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
            if name == "sara":
                from_data.append(0)
            else:
示例#33
0
### temp_counter helps you only look at the first 200 emails in the list so you
### can iterate your modifications quicker
temp_counter = 0

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        # Development cap: the counter is shared by both listings, so only
        # the first 199 emails overall are processed while it is active.
        temp_counter += 1
        if temp_counter < 200:
            # Each listing line ends with a newline; drop it before joining.
            path = os.path.join('..', path[:-1])
            print(path)

            # Close the email even if parsing raises.
            with open(path, "r") as email:
                email_spammed = parseOutText(email)

            # str.replace returns a new string, so the result must be
            # rebound; the previous code discarded it and removed nothing.
            for i in ["sara", "shackleton", "chris", "germani"]:
                email_spammed = email_spammed.replace(i, '')

            word_data.append(email_spammed)

            # Author label: 0 for Sara, 1 for Chris.
            from_data.append(0 if name == 'sara' else 1)

print("emails processed")
### thousands of emails from Sara and Chris, so running over all of them
### can take a long time
### temp_counter helps you only look at the first 200 emails in the list so you
### can iterate your modifications quicker
temp_counter = 0

for name, from_person in (("sara", from_sara), ("chris", from_chris)):
    for path in from_person:
        # Listing lines end with a newline; strip it and resolve the email
        # relative to the parent directory.
        path = os.path.join('..', path[:-1])
        print(path)
        email = open(path, "r")

        text = parseOutText(email)

        # Strip the signature words from the parsed text.
        words_to_remove = [
            "sara", "shackleton", "chris",
            "germani", "sshacklensf", "cgermannsf",
        ]
        for word in words_to_remove:
            text = text.replace(word, "")
        word_data.append(text)

        # Author label: 0 for Sara, 1 for Chris.
        if name == "sara":
            from_data.append(0)
        else:
            from_data.append(1)

        email.close()

print("emails processed")
from_sara.close()
from_chris.close()
示例#35
0
### temp_counter helps you only look at the first 200 emails in the list so you
### can iterate your modifications quicker
# temp_counter = 0

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        # temp_counter += 1
        # if temp_counter < 200:
        path = os.path.join('..', path[:-1])
        # print path
        email = open(path, "r")

        ### use parseOutText to extract the text from the opened email
        parsed_email = str((parseOutText(email)))

        # print (parsed_email)

        ### use str.replace() to remove any instances of the words
        ### ["sara", "shackleton", "chris", "germani"]
        stopwords = [
            "sara", "shackleton", "chris", "germani", "sshacklensf",
            "cgermannsf"
        ]
        new = parsed_email
        for stopword in stopwords:
            if stopword in new:
                new = new.replace(stopword, '')
        word_data.append(new)
def dump_data():
    """Parse the emails listed for Sara and Chris and pickle the results.

    Reads ``from_sara.txt`` and ``from_chris.txt`` (one email path per line,
    resolved relative to ``..``), runs each email through ``parseOutText``,
    strips the signature words, and records an author label per email
    (0 for Sara, 1 for Chris).  The two lists are written to
    ``your_word_data.pkl`` and ``your_email_authors.pkl``.

    Returns:
        tuple: ``(word_data, from_data)`` -- the cleaned texts and the
        matching author labels, in processing order.

    Raises:
        IOError: if a listing, an email, or a pickle file cannot be opened.
    """
    # Words stripped from every parsed email body.
    removewords = ["sara", "shackleton", "chris", "germani", "sshacklensf","cgermannsf"]

    from_data = []
    word_data = []

    # Context managers guarantee the listings and every email are closed,
    # including when parsing raises part-way through.
    with open("from_sara.txt", "r") as from_sara, \
            open("from_chris.txt", "r") as from_chris:
        for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
            label = 0 if name == "sara" else 1
            for path in from_person:
                # Each listing line ends with a newline; drop it first.
                path = os.path.join('..', path[:-1])
                print(path)

                with open(path, "r") as email:
                    parsed_out_text = parseOutText(email)

                for word in removewords:
                    parsed_out_text = parsed_out_text.replace(word, "")

                word_data.append(parsed_out_text)
                from_data.append(label)

    print("emails processed")

    # Quiz helper: skip it rather than raise IndexError (and lose the pickles)
    # when fewer than 153 emails were processed.
    if len(word_data) > 152:
        print("Answer to Lesson 10 Quiz, content of word_data[152] %s"
              % word_data[152])

    # Close the pickle files explicitly so the data is flushed to disk.
    with open("your_word_data.pkl", "w") as word_file:
        pickle.dump(word_data, word_file)
    with open("your_email_authors.pkl", "w") as author_file:
        pickle.dump(from_data, author_file)

    return word_data, from_data
### temp_counter helps you only look at the first 200 emails in the list
temp_counter = 0


for name, from_person in (("sara", from_sara), ("chris", from_chris)):
    for path in from_person:
        # Development cap: skip everything once the shared counter reaches
        # 200 (i.e. only the first 199 emails are processed).
        temp_counter += 1
        if temp_counter >= 200:
            continue

        path = os.path.join('..', path[:-1])
        print(path)
        email = open(path, "r")

        parsed_email = parseOutText(email)

        # Strip the signature words from the parsed text.
        noise_words = ["sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf"]
        for word in noise_words:
            parsed_email = parsed_email.replace(word,'')
        word_data.append(parsed_email)

        # Author label: 0 for Sara, 1 for Chris.
        if name == "sara":
            from_data.append(0)
        else:
            from_data.append(1)

        email.close()
temp_counter = 0
replaceWords = ["sara", "shackleton", "chris", "germani"]


for name, from_person in (("sara", from_sara), ("chris", from_chris)):
    for path in from_person:
        # Counts visited emails; the development cap on it is commented out.
        temp_counter += 1
        path = os.path.join('..', path[:-1])
        email = open(path, "r")

        parsedEmail = parseOutText(email)

        # Strip the signature words from the parsed text.
        for word in replaceWords:
            parsedEmail = parsedEmail.replace(word, "")
        word_data.append(parsedEmail)

        # Author label: 0 for Sara, 1 for Chris; other names get no label.
        if name == 'sara':
            from_data.append(0)
        elif name == 'chris':
            from_data.append(1)

        email.close()
### can iterate your modifications quicker
temp_counter = 0


for name, from_person in (("sara", from_sara), ("chris", from_chris)):
    for path in from_person:
        # The counter is never incremented in this block, so this guard
        # never triggers and every email is processed.
        if temp_counter >= 200:
            continue

        path = os.path.join('..', path[:-1])
        print(path)
        email = open(path, "r")

        stemmed_email = parseOutText(email)

        # Strip the signature words from the parsed text.
        remove = ["sara", "shackleton", "chris", "germani","sshacklensf","cgermannsf"]
        for word in remove:
            stemmed_email = stemmed_email.replace(word,'')
        word_data.append(stemmed_email)

        # Author label: 0 for Sara, 1 for Chris; other names get no label.
        if name == 'sara':
            from_data.append(0)
        elif name == 'chris':
            from_data.append(1)

        email.close()
示例#40
0
### temp_counter helps you only look at the first 200 emails in the list so you
### can iterate your modifications quicker
temp_counter = 0

for name, from_person in (("sara", from_sara), ("chris", from_chris)):
    for path in from_person:
        # The counter is never incremented in this block, so this guard
        # never triggers and every email is processed.
        if temp_counter >= 200:
            continue

        # Listing lines end with a newline; strip it and resolve the email
        # relative to the current directory.
        path = './' + path[:-1]
        print(path)
        email = open(path, "r")

        texto = parseOutText(email)

        # Strip the signature words from the parsed text.
        for word in ("sara", "shackleton", "chris", "germani"):
            texto = texto.replace(word, '')
        word_data.append(texto)

        # Author label: 0 for Sara, 1 for Chris.
        from_data.append(0 if name == 'sara' else 1)

        email.close()

print("emails processed")
from_sara.close()
from_chris.close()
### temp_counter helps you only look at the first 200 emails in the list so you
### can iterate your modifications quicker
temp_counter = 0

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        # Development cap: the counter is shared by both listings, so only
        # the first 199 emails overall are processed while it is active.
        temp_counter += 1
        if temp_counter < 200:
            # Each listing line ends with a newline; drop it before joining.
            path = os.path.join('..', path[:-1])
            print(path)

            # Everything below used to sit outside this `if`, so capped
            # iterations re-parsed a stale `email` object, and the file was
            # only closed in the Chris branch.  All per-email work now lives
            # under the cap and every email is closed.
            with open(path, "r") as email:
                email_temp = str(parseOutText(email))

            # Strip the signature words from the parsed text.
            rem = ["sara", "shackleton", "chris", "germani"]
            for word in rem:
                email_temp = email_temp.replace(word, "")
            word_data.append(email_temp)

            # Author label: 0 for Sara, 1 for Chris.
            if name == "sara":
                from_data.append(0)
            else:
                from_data.append(1)
# temp_counter = 0

# Words stripped from every parsed email body.
words_to_remove = ["sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf"]

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        # Each listing line ends with a newline; drop it before joining.
        path = os.path.join('..', path[:-1])
        print(path)

        # The context manager closes the email; the previous code never
        # closed it and leaked one handle per email.
        with open(path, "r") as email:
            email_contents = parseOutText(email)

        for w in words_to_remove:
            email_contents = email_contents.replace(w, "")
        word_data.append(email_contents)

        # Author label: 0 for Sara, 1 for Chris.
        if name == "sara":
            from_data.append(0)
        else:
            from_data.append(1)
示例#43
0
### can iterate your modifications quicker
#temp_counter = 0

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        #temp_counter += 1
        #if temp_counter < 200:

        path = os.path.join('..', path[:-1])
        print path
        email = open(path, "r")

        ### use parseOutText to extract the text from the opened email
        body_message = parseOutText(email)

        # print "1: ", body_message

        ### use str.replace() to remove any instances of the words
        ### ["sara", "shackleton", "chris", "germani"]
        replace_list = [
            "sara", "shackleton", "chris", "germani", "sshacklensf",
            "cgermannsf"
        ]
        for word in replace_list:
            body_message = body_message.replace(word, "")

        # print "2: ", body_message

        ### append the text to word_data
### can iterate your modifications quicker
temp_counter = 0


for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        # The counter is never incremented in this block, so the cap below
        # is always satisfied and every email is processed.
        if temp_counter < 200:
            # Each listing line ends with a newline; drop it before joining.
            path = os.path.join('..', path[:-1])
            print(path)

            # The context manager closes the email; the previous code never
            # closed it and leaked one handle per email.
            with open(path, "r") as email:
                text = parseOutText(email)

            # Strip the signature words from the parsed text.
            words = ["sara", "shackleton", "chris", "germani","sshacklensf","cgermannsf"]
            for word in words:
                text = text.replace(word, '')
            word_data.append(text)

            # Author label: 0 for Sara, 1 for Chris.
            print(name)
            if name=='sara':
                from_data.append(0)
            else:
                from_data.append(1)
示例#45
0
pattern=re.compile('\s+')


for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        temp_counter += 1
        #if temp_counter < 200:
        if(True):
            path = os.path.join('..', path[:-1])#remove the '\n'
            print path
            email = open(path, "r")

            ### use parseOutText to extract the text from the opened email
            temp = parseOutText(email)
            ### use str.replace() to remove any instances of the words
            ### ["sara", "shackleton", "chris", "germani"]
            temp=temp.replace("sara","")
            temp=temp.replace("shackleton","")
            temp=temp.replace("chris","")
            temp=temp.replace("germani","")
            temp=temp.replace("sshacklensf","")
            temp=temp.replace("cgermannsf","")
            temp=re.sub(pattern,' ',temp)
            

            ### append the text to word_data
            word_data.append(temp)

            ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
示例#46
0
### can take a long time
### temp_counter helps you only look at the first 200 emails in the list so you
### can iterate your modifications quicker

counter = 0
for name, from_person in (("sara", from_sara), ("chris", from_chris)):
    for path in from_person:
        # Counts visited emails; the development cap on it is commented out.
        counter += 1
        path = os.path.join('..', path[:-1])
        print(path)
        email = open(path, "r")

        email_text = str(parseOutText(email))

        # Strip the signature words one after another, in the same order as
        # the original chained call, then trim surrounding whitespace.
        for signature in ("sara", "shackleton", "chris", "germani",
                          "sshacklensf", "cgermannsf"):
            email_text = email_text.replace(signature, "")
        email_text = email_text.strip()
        word_data.append(email_text)

        # Author label: 0 for Sara, 1 for Chris.
        if name == "sara":
            from_data.append(0)
        else:
            from_data.append(1)
        email.close()
示例#47
0
# temp_counter helps you only look at the first 200 emails in the list so you
# can iterate your modifications quicker
# Left over from the starter code: the development cap that used this counter
# is commented out, so every listed email is processed.
temp_counter = 0

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        # Each listing line ends with a newline; drop it before joining.
        path = os.path.join('../../..', path[:-1])
        print(path)

        # The context manager closes the email; the previous code never
        # closed it and leaked one handle per email.
        with open(path, "r") as email:
            text = parse_out_email_text.parseOutText(email)

        # Strip the signature words from the parsed text.
        for word in ('sara', 'shackleton', 'chris', 'germani'):
            text = text.replace(word, '')
        word_data.append(text)

        # Author label: 0 for Sara, 1 for Chris.
        if name == 'sara':
            from_data.append(0)
        else:
            from_data.append(1)
示例#48
0
### can iterate your modifications quicker
temp_counter = 0


for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        # Development cap: the counter is shared by both listings, so only
        # the first 199 emails overall are processed while it is active.
        temp_counter += 1
        if temp_counter < 200:
            # Each listing line ends with a newline; drop it before joining.
            path = os.path.join('..', path[:-1])
            print(path)

            # Close the email even if parsing raises.
            with open(path, "r") as email:
                words = parseOutText(email)

            # str.replace returns a new string, so the result must be
            # rebound; the previous code discarded it and removed nothing.
            for signature in ("sara", "shackleton", "chris", "germani"):
                words = words.replace(signature, "")
            word_data.append(words)

            # Author label: 0 for Sara, 1 for Chris.
            from_data.append(0 if name == "sara" else 1)

print("emails processed")
from_sara.close()
from_chris.close()

# Close the pickle file explicitly so the data is flushed to disk.
with open("your_word_data.pkl", "w") as word_file:
    pickle.dump(word_data, word_file)
示例#49
0
### can iterate your modifications quicker
temp_counter = 0


for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        temp_counter += 1
        if temp_counter > 0:
            path = os.path.join("..", path[:-1])
            print path
            email = open(path, "r")

            ### use parseOutText to extract the text from the opened email
            stemmedString = parseOutText(email)
            remove_words = ["sara", "shackleton", "chris", "germani"]
            for rw in remove_words:
                stemmedString = stemmedString.replace(rw, "")

            stemmedString = " ".join(stemmedString.split())

            ### use str.replace() to remove any instances of the words
            ### ["sara", "shackleton", "chris", "germani"]

            ### append the text to word_data
            word_data.append(stemmedString)

            ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris

            if name == "sara":
### can take a long time
### temp_counter helps you only look at the first 200 emails in the list
### NOTE: the counter check inside the loop below is commented out, so
### temp_counter is never changed in the visible code and every listed email
### is processed.
temp_counter = 0
from nltk.corpus import *
### English stopword list (`stopwords` presumably comes from the star import
### above); it is not referenced again in the visible part of this snippet
sw = stopwords.words('english')

### Walk both address lists; `name` records which list a path came from.
for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        ##temp_counter += 1
        ##if temp_counter < 200:
            ### drop the last character of the line (presumably the trailing
            ### newline) and resolve the rest relative to the parent directory
            path = os.path.join('..', path[:-1])
            print path
            email = open(path, "r")
            content = parseOutText(email)


            ### Despite the name, these are plain substrings removed with
            ### str.replace(), not whole-word stopwords; the last two entries
            ### are presumably signature artifacts of the two senders
            ### (TODO confirm against the parsed emails).
            stop_words = ["sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf"]

            ### str.replace() returns a new string, so assign the result back
            for w in stop_words:
                content = content.replace(w, "")

            ### append the text to word_data
            word_data.append( content )

            ### label the email: 0 when it comes from the "sara" list
            if name == "sara":
                from_data.append(0)
            else:
示例#51
0
### word_data receives one cleaned email text per path (see the append below)
word_data = []
### from_data is only initialised in the visible code; presumably it is
### filled with the 0/1 sender labels after each append - TODO confirm
from_data = []
from parse_out_email_text import parseOutText
### Walk both address lists; `name` records which list a path came from.
for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        ### only look at first 200 emails when developing
        ### once everything is working, remove this line to run over full dataset
        ### NOTE: the cap below is commented out, so every path is processed
        #temp_counter += 1
        #if temp_counter < 200:
        ### keep the raw line before `path` is rewritten; origPath is not
        ### read again in the visible part of this loop
        origPath = path
        ### drop the last character of the line (presumably the trailing
        ### newline) and resolve the rest relative to the parent directory
        path = os.path.join('..', path[:-1])
        print path
        email = open(path, "r")

        ### use parseOutText to extract the text from the opened email
        string1 = parseOutText(email)

        ### use str.replace() to remove any instances of the words
        ### ["sara", "shackleton", "chris", "germani"]
        ### (plus two extra strings, presumably signature artifacts - TODO confirm)
        stringList = [
            "sshacklensf", "cgermannsf", "sara", "shackleton", "chris",
            "germani"
        ]
        ### further candidate strings, currently disabled:
        #"houectect", "houston", "houect", "fax", "smith", "1400", "forward", "germany", "street", "77002"
        ### str.replace() returns a new string, so assign the result back
        for string in stringList:
            string1 = string1.replace(string, "")

        ### append the text to word_data
        word_data.append(string1)

        ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris