Exemplo n.º 1
0
def root_to_json(root_dir, output_file):
    """Write every email yielded by ``EmailWalker(root_dir)`` to a file.

    Each email is written as one line: the string returned by
    ``JSONValueProtocol.write(None, email)`` followed by a newline.

    Args:
        root_dir: Location handed to ``EmailWalker``.
        output_file: Path of the file to create or overwrite.
    """
    walker = EmailWalker(root_dir)

    # The context manager closes the handle even when the walker, the date
    # conversion or the serialiser raises part-way through the dump.
    with open(output_file, "w") as output:
        for email in walker:
            # Replace the date with its string form before serialising
            # (presumably the raw value is not JSON-serialisable -- confirm).
            email['date'] = str(email['date'])
            output.write(JSONValueProtocol.write(None, email) + '\n')
Exemplo n.º 2
0
def walk_emails(folder):

    sender_tf = defaultdict(Counter)
    sender_counts = Counter()

    for e in EmailWalker(folder):
        # split by lines and filter out quotations
        text = ' '.join(
            filter(lambda line: not re.search('^\s+>+', line),
                   e['text'].split('\n')))

        terms_in_email = reasonable_words(
            text.split())  # split the email text using whitespaces

        terms_in_email = group_in_grams(2, terms_in_email)

        sender_tf[e['sender']].update(terms_in_email)
        sender_counts[e['sender']] += 1

    # Count all the senders that every term was sent by
    allterms = Counter()
    for sender_terms in sender_tf.itervalues():
        allterms.update(sender_terms.keys())

    # From the counts, compute the IDFs
    idfs = {}
    nsenders = len(sender_tf)  # num of keys is num of senders
    for term, sender_count in allterms.iteritems():
        idfs[term] = math.log(nsenders / (1.0 + sender_count))

    # Filter down to most frequent senders
    top_senders = sorted(sender_counts.items(),
                         key=itemgetter(1),
                         reverse=True)[:20]

    # Combine TFs and IDFs to make TFIDFs
    tfidfs = defaultdict(list)
    for sender, count in top_senders:
        for term, tf in sender_tf[sender].iteritems():
            tfidfs[sender].append((term, tf * idfs[term]))

    # Display
    for sender, count in top_senders:
        print sender, count

        sorted_by_count_top20 = sorted(tfidfs[sender],
                                       key=itemgetter(1),
                                       reverse=True)[:20]
        for pair in sorted_by_count_top20:
            print '\t', pair
Exemplo n.º 3
0
def walk_emails(folder):

    folder_tf = defaultdict(Counter)
    terms_per_folder = defaultdict(set)

    for e in EmailWalker(folder):
        terms_in_email = reasonable_words(
            e['text'].split())  # split the email text using whitespaces

        folder_tf[e['folder']].update(terms_in_email)

        # For the IDF, this collects all of the terms in each folder
        terms_per_folder[e['folder']].update(terms_in_email)

    # Count all the folders that every term appears in
    allterms = Counter()
    for terms in terms_per_folder.itervalues():
        allterms.update(terms)

    # From the counts, compute the IDFs
    idfs = {}
    nfolders = len(terms_per_folder)  # num of keys is num of folders
    for term, folder_count in allterms.iteritems():
        idfs[term] = math.log(nfolders / (1.0 + folder_count))

    # Combine TFs and IDFs to make TFIDFs
    tfidfs = defaultdict(dict)
    for folder, terms in folder_tf.iteritems():
        for term, tf in terms.iteritems():
            tfidfs[folder][term] = (tf * idfs[term])

    # Display
    seen_folders = set()
    similarities = {}
    for folder, scores in tfidfs.items():
        seen_folders.add(folder)
        for folder2, scores2 in tfidfs.items():
            if folder2 not in seen_folders:
                similarities[(folder, folder2)] = cos_sim(scores, scores2)

    sorted_by_sim_top_20 = sorted(similarities.items(),
                                  key=itemgetter(1),
                                  reverse=True)[:20]
    for folder_pair, similarity in sorted_by_sim_top_20:
        print folder_pair, similarity
Exemplo n.º 4
0
def walk_emails(folder):

    folder_tf = defaultdict(Counter)
    terms_per_folder = defaultdict(set)

    for e in EmailWalker(folder):
        terms_in_email = reasonable_words(
            e['text'].split())  # split the email text using whitespaces

        folder_tf[e['folder']].update(terms_in_email)

        # For the IDF, this collects all of the terms in each folder
        terms_per_folder[e['folder']].update(terms_in_email)

    # Count all the folders that every term appears in
    allterms = Counter()
    for terms in terms_per_folder.itervalues():
        allterms.update(terms)

    # From the counts, compute the IDFs
    idfs = {}
    nfolders = len(terms_per_folder)  # num of keys is num of folders
    for term, folder_count in allterms.iteritems():
        idfs[term] = math.log(nfolders / (1.0 + folder_count))

    # Combine TFs and IDFs to make TFIDFs
    tfidfs = defaultdict(list)
    for folder, terms in folder_tf.iteritems():
        for term, tf in terms.iteritems():
            tfidfs[folder].append((term, tf * idfs[term]))

    # Display
    for folder, scores in tfidfs.items():
        print folder
        sorted_by_count_top20 = sorted(scores, key=itemgetter(1),
                                       reverse=True)[:20]
        for pair in sorted_by_count_top20:
            print '\t', pair
Exemplo n.º 5
0
def loadEnronEmails(folder=''):
    """Clear the 'ER' sub-keyspace, then load emails through a BoundedBuffer.

    Args:
        folder: Suffix appended to ``localdir`` to form the location that
            ``EmailWalker`` reads from; the default reads ``localdir`` itself.
    """
    # Clear the 'ER' sub-keyspace of ``db`` before loading.
    clearSubKeyspace(db, 'ER')

    # The walker is handed to the buffer together with ``addEmail``
    # (presumably producer and consumer respectively -- confirm against
    # BoundedBuffer).
    source_dir = localdir + folder
    BoundedBuffer(EmailWalker(source_dir), addEmail).produce_and_consume()
Exemplo n.º 6
0
def loadEnronEmailsSequential():
    """Clear the 'ER' sub-keyspace, then add each email to ``db`` in turn.

    Every item yielded by ``EmailWalker(localdir)`` is passed to
    ``addEmail`` together with ``db``, one at a time, in iteration order.
    """
    # Clear the 'ER' sub-keyspace of ``db`` before loading.
    clearSubKeyspace(db, 'ER')

    for message in EmailWalker(localdir):
        addEmail(db, message)