# tfidf.py
from pyspark import SparkContext
import json
import time
import math
from operator import add
from difflib import get_close_matches
from term_tools import get_terms
print 'loading'
sc = SparkContext("spark://ec2-107-22-0-110.compute-1.amazonaws.com:7077", "TF-IDF", pyFiles=['term_tools.py'])
# Returns a list of dicts => each dict containing a sender and a term
def sender_term_pairs(email):
sender = email['sender']
master_email = sender
if sender in email_to_master.value:
master_email = email_to_master.value[sender]
return map(lambda x: {'sender': master_email, 'term': x}, get_terms(email['text']))
# [{'term': u'talk', 'sender': u'rosalee.fleming@enron.com'}, {'term': u'talk', 'sender': u'rosalee.fleming@enron.com'}, {'term': u'talk', 'sender': u'rosalee.fleming@enron.com'}]
#
# Given a list like the one above, creates a dictionary of the frequency of the term by the sender
def single_sender_term_freq(sender_list):
sender_tf = {}
for pair in sender_list:
key = pair['sender']
if (key in sender_tf):
sender_tf[key] += 1
else:
sender_tf[key] = 1
return sender_tf
# Given a tuple of a term and a list of individual occurences by different senders, calculates the term frequency per sender
def sender_tf(grouped_pair):
tf_dict = single_sender_term_freq(grouped_pair[1])
term = grouped_pair[0]
return map(lambda y: (term, y), tf_dict.items())
def consolidate_emails(grouped_pair):
letter = grouped_pair[0]
email_pairs = grouped_pair[1]
all_names = [x[1] for x in email_pairs]
name_email_dict = {}
for x in email_pairs:
name_email_dict[x[1]] = x[0]
consolidated_emails = []
for pair in email_pairs:
email = pair[0]
name = pair[1]
close_matches = get_close_matches(name, all_names, cutoff=0.8)
matching_emails = []
for x in close_matches:
matching_emails.append(name_email_dict[x])
consolidated_emails.append((email, set(matching_emails)))
return map(lambda y: y, consolidated_emails)
#---- BEGIN PROCESSING ------#
corpus = sc.textFile('s3n://AKIAJFDTPC4XX2LVETGA:lJPMR8IqPw2rsVKmsSgniUd+cLhpItI42Z6DCFku@6885public/enron/*.json')
#corpus = sc.textFile('s3n://AKIAJFDTPC4XX2LVETGA:lJPMR8IqPw2rsVKmsSgniUd+cLhpItI42Z6DCFku@6885public/enron/lay-k.json')
#corpus = sc.textFile('s3n://AKIAJFDTPC4XX2LVETGA:lJPMR8IqPw2rsVKmsSgniUd+cLhpItI42Z6DCFku@6885public/fsosa/short.json')
json_corpus = corpus.map(lambda x: json.loads(x)).cache()
#--- Disambiguation ---#
# Approach:
# - Get a list of distinct emails from the data set
# - Sort them into groups based on the first letter of the name part of the email (i.e. before '@')
# - Create a dictionary of email to master where master is the email that we will consolidate our searches over
# e.g ken.lay@enron.com and ken.lay@yahoo.com have as their master email ken.lay@enron.com
# - Broadcast to all nodes so that they know about it
unique_emails = json_corpus.map(lambda x: x['sender']).distinct()
lastnames = unique_emails.map(lambda x: (x,x.split("@")[0])).groupBy(lambda x: x[1][0], 500)
consolidated = lastnames.flatMap(consolidate_emails).flatMap(lambda x: map(lambda y: (y, x[0]), x[1])).collectAsMap()
email_to_master = sc.broadcast(consolidated)
#----- Actual TF-IDF Calculation -----#
# Calculate per-term idf
term_counts = json_corpus.flatMap(lambda x: get_terms(x['text'])).map(lambda y: (y, 1)).reduceByKey(add)
per_term_idf = term_counts.map(lambda x: (x[0], math.log(516893.0 / x[1]))).cache()
# Get sender/term pairs
grouped_sender_term_pairs = json_corpus.flatMap(sender_term_pairs).groupBy(lambda x: x['term'], 500)
# Calculate sender-term frequency
sender_tf = grouped_sender_term_pairs.flatMap(sender_tf).cache()
#e.g. join: (u'talk', ((u'rosalee.fleming@enron.com', 3), 12.056978880153091))
tfidf = sender_tf.join(per_term_idf, 500).map(lambda x:{'sender': x[1][0][0], 'term':x[0], 'tf-idf':x[1][0][1]*x[1][1]})
output = tfidf.collect()
for x in output:
print x