def root_to_json(root_dir, output_file):
    """Write every email found under ``root_dir`` to ``output_file``, one per line.

    Each item produced by ``EmailWalker(root_dir)`` has its ``'date'`` entry
    replaced by ``str(email['date'])`` and is then encoded with
    ``JSONValueProtocol.write(None, email)``; the encoded value plus a
    trailing newline is appended to the output file.

    Args:
        root_dir: Passed unchanged to ``EmailWalker``.
        output_file: Path of the file to create (opened in ``"w"`` mode, so
            any existing content is truncated).

    Note:
        The email mapping yielded by the walker is modified in place (its
        ``'date'`` value becomes a string).
        NOTE(review): ``JSONValueProtocol.write`` is called with ``None`` in
        the first position, which presumably relies on it not using instance
        state -- confirm against the protocol implementation in use.
    """
    walker = EmailWalker(root_dir)
    # The context manager guarantees the file is flushed and closed even if
    # the walker or the serializer raises part-way through.
    with open(output_file, "w") as output:
        for email in walker:
            # Stringify the date before encoding it.
            email['date'] = str(email['date'])
            output.write(JSONValueProtocol.write(None, email) + '\n')
def walk_emails(folder): sender_tf = defaultdict(Counter) sender_counts = Counter() for e in EmailWalker(folder): # split by lines and filter out quotations text = ' '.join( filter(lambda line: not re.search('^\s+>+', line), e['text'].split('\n'))) terms_in_email = reasonable_words( text.split()) # split the email text using whitespaces terms_in_email = group_in_grams(2, terms_in_email) sender_tf[e['sender']].update(terms_in_email) sender_counts[e['sender']] += 1 # Count all the senders that every term was sent by allterms = Counter() for sender_terms in sender_tf.itervalues(): allterms.update(sender_terms.keys()) # From the counts, compute the IDFs idfs = {} nsenders = len(sender_tf) # num of keys is num of senders for term, sender_count in allterms.iteritems(): idfs[term] = math.log(nsenders / (1.0 + sender_count)) # Filter down to most frequent senders top_senders = sorted(sender_counts.items(), key=itemgetter(1), reverse=True)[:20] # Combine TFs and IDFs to make TFIDFs tfidfs = defaultdict(list) for sender, count in top_senders: for term, tf in sender_tf[sender].iteritems(): tfidfs[sender].append((term, tf * idfs[term])) # Display for sender, count in top_senders: print sender, count sorted_by_count_top20 = sorted(tfidfs[sender], key=itemgetter(1), reverse=True)[:20] for pair in sorted_by_count_top20: print '\t', pair
def walk_emails(folder): folder_tf = defaultdict(Counter) terms_per_folder = defaultdict(set) for e in EmailWalker(folder): terms_in_email = reasonable_words( e['text'].split()) # split the email text using whitespaces folder_tf[e['folder']].update(terms_in_email) # For the IDF, this collects all of the terms in each folder terms_per_folder[e['folder']].update(terms_in_email) # Count all the folders that every term appears in allterms = Counter() for terms in terms_per_folder.itervalues(): allterms.update(terms) # From the counts, compute the IDFs idfs = {} nfolders = len(terms_per_folder) # num of keys is num of folders for term, folder_count in allterms.iteritems(): idfs[term] = math.log(nfolders / (1.0 + folder_count)) # Combine TFs and IDFs to make TFIDFs tfidfs = defaultdict(dict) for folder, terms in folder_tf.iteritems(): for term, tf in terms.iteritems(): tfidfs[folder][term] = (tf * idfs[term]) # Display seen_folders = set() similarities = {} for folder, scores in tfidfs.items(): seen_folders.add(folder) for folder2, scores2 in tfidfs.items(): if folder2 not in seen_folders: similarities[(folder, folder2)] = cos_sim(scores, scores2) sorted_by_sim_top_20 = sorted(similarities.items(), key=itemgetter(1), reverse=True)[:20] for folder_pair, similarity in sorted_by_sim_top_20: print folder_pair, similarity
def walk_emails(folder): folder_tf = defaultdict(Counter) terms_per_folder = defaultdict(set) for e in EmailWalker(folder): terms_in_email = reasonable_words( e['text'].split()) # split the email text using whitespaces folder_tf[e['folder']].update(terms_in_email) # For the IDF, this collects all of the terms in each folder terms_per_folder[e['folder']].update(terms_in_email) # Count all the folders that every term appears in allterms = Counter() for terms in terms_per_folder.itervalues(): allterms.update(terms) # From the counts, compute the IDFs idfs = {} nfolders = len(terms_per_folder) # num of keys is num of folders for term, folder_count in allterms.iteritems(): idfs[term] = math.log(nfolders / (1.0 + folder_count)) # Combine TFs and IDFs to make TFIDFs tfidfs = defaultdict(list) for folder, terms in folder_tf.iteritems(): for term, tf in terms.iteritems(): tfidfs[folder].append((term, tf * idfs[term])) # Display for folder, scores in tfidfs.items(): print folder sorted_by_count_top20 = sorted(scores, key=itemgetter(1), reverse=True)[:20] for pair in sorted_by_count_top20: print '\t', pair
def loadEnronEmails(folder=''):
    """Reload emails from ``localdir + folder`` through a ``BoundedBuffer``.

    Calls ``clearSubKeyspace(db, 'ER')`` first, then builds an
    ``EmailWalker`` over ``localdir + folder`` and hands it, together with
    ``addEmail``, to a ``BoundedBuffer`` whose ``produce_and_consume()`` is
    run.

    Args:
        folder: Suffix concatenated directly onto ``localdir`` (plain string
            concatenation, no path separator is inserted). Defaults to ``''``.
    """
    clearSubKeyspace(db, 'ER')
    # NOTE(review): presumably the buffer invokes addEmail for each item the
    # walker yields -- confirm against BoundedBuffer.
    BoundedBuffer(EmailWalker(localdir + folder), addEmail).produce_and_consume()
def loadEnronEmailsSequential():
    """Reload every email under ``localdir``, one at a time.

    Calls ``clearSubKeyspace(db, 'ER')`` first, then passes each item yielded
    by ``EmailWalker(localdir)`` to ``addEmail(db, ...)`` in iteration order.
    """
    clearSubKeyspace(db, 'ER')
    for message in EmailWalker(localdir):
        addEmail(db, message)