def main(pool):
    """Compute and print summary statistics over the email database.

    Prints, in order: most prolific senders, most-replied-to senders,
    highest thread-reply-frequency senders, and the highest/lowest
    average reading-level scores, then the elapsed wall time.

    Args:
        pool: worker pool forwarded to the stat_* helpers
            (presumably a multiprocessing pool — TODO confirm).
    """
    # NOTE: originally written with Python 2 print statements; converted to
    # print() calls for consistency with the rest of this module.
    emailDatabase = email_database.buildEmailDatabase()
    start_time = time.time()
    mostEmails = stat_mostEmails(pool, emailDatabase.values())
    countDatabase = {}
    print("--------------------")
    print("Most emails sent:")
    for k in mostEmails:
        print(str(k[0]) + " sent " + str(k[1]) + " emails")
        # keep a database of email counts to use later for the reading level
        countDatabase[k[0]] = k[1]
    print("-------------------")
    print("Most replied to emails")
    mostRepliedTo = stat_mostRepliedTo(pool, emailDatabase.values())
    for k in mostRepliedTo[:20]:
        print(str(k[0]) + " had " + str(k[1]) + " email replies to his/her email threads")
    mostThreads = stat_mostThreads(pool, emailDatabase.values())
    print("--------------------")
    print("Most likely to reply to an email thread")
    for k in mostThreads[:20]:
        print(str(k[0]) + " has a thread reply frequency score of " + str(k[1]))
    print("--------------------")
    readingLevel = stat_readingDifficulty(pool, emailDatabase.values())
    # average = total reading score / number of emails the sender wrote;
    # sorted by score descending, then by sender name for a stable order
    avgReadingLevel = sorted(
        [(k[0], k[1] / countDatabase[k[0]]) for k in readingLevel],
        key=lambda obj: (-obj[1], obj[0]),
    )
    print("Highest grade reading level of emails")
    for k in avgReadingLevel[:10]:
        print(str(k[0]) + " averaged a score of " + str(k[1]))
    print("-------------------")
    print("Lowest grade reading level of emails")
    # reversed view of the same ranking gives the lowest scores first
    for k in avgReadingLevel[::-1][:10]:
        print(str(k[0]) + " averaged a score of " + str(k[1]))
    print("-------------------")
    print("finished in " + str(time.time() - start_time) + "s")
def main():
    """Export the email database to ``emails.js`` as JS variable declarations.

    Writes three ``var`` declarations consumed by a front-end visualization:
      - ``threadlinks``: reply edges pointing each replier at the thread's
        first sender (self-replies by the starter are skipped)
      - ``emailsbydate``: emails grouped by send day, keyed by epoch
        milliseconds (JS Date convention)
      - ``authors``: de-duplicated sender addresses

    NOTE(review): this redefines the earlier ``main(pool)`` in this module,
    so only whichever definition comes last is callable — consider renaming.
    """
    # 'with' guarantees the output file is flushed/closed even on error
    # (the original leaked the handle by never calling close()).
    with open("emails.js", "w") as f:
        emailDatabase = email_database.buildEmailDatabase()
        # complete list of threads
        emailList = []
        threads = {}  # threadid -> [(sender, timestamp, subject), ...]
        for k in emailDatabase:
            # drop the bulky message body; only metadata is exported
            del emailDatabase[k]["content"]
            parsedEmail = re.search(EMAIL_PATTERN, emailDatabase[k]["from"]).group(0)
            parsedDate = email.utils.parsedate_tz(emailDatabase[k]["date"])
            # epoch seconds adjusted by the header's UTC offset (last tuple slot)
            timestamp = time.mktime(parsedDate[:-1]) + parsedDate[-1]
            emailDatabase[k]["from"] = parsedEmail
            emailList.append(emailDatabase[k])
            try:
                threads[emailDatabase[k]["threadid"]].append(
                    (parsedEmail, timestamp, emailDatabase[k]["subject"]))
            except KeyError:
                threads[emailDatabase[k]["threadid"]] = [
                    (parsedEmail, timestamp, emailDatabase[k]["subject"])]
        threadLinks = []
        # thread mappings
        for thread in threads:
            # sort thread groups based on timestamp to get the first email
            sortedThreadEmails = sorted(threads[thread], key=lambda obj: obj[1])
            for em in range(1, len(sortedThreadEmails)):
                # skip replies by the thread starter to their own thread
                if sortedThreadEmails[em][0] == sortedThreadEmails[0][0]:
                    continue
                threadLinks.append({
                    "source": sortedThreadEmails[em][0],
                    "target": sortedThreadEmails[0][0],
                    "subject": sortedThreadEmails[0][2],
                })
        print("var threadlinks = " + json.dumps(threadLinks) + ";", file=f)
        # emails by date
        dateList = {}
        for em in emailList:
            # "day month year" tokens from the Date header, e.g. "01 Jan 2001"
            date = time.mktime(time.strptime(" ".join(em["date"].split()[1:4]), "%d %b %Y"))
            date = int(date) * 1000  # JS Date() expects epoch milliseconds
            if date in dateList:
                dateList[date].append(em)
            else:
                dateList[date] = [em]
        print("var emailsbydate = " + json.dumps(dateList) + ";", file=f)
        # email authors (from fields were already normalized above, so the
        # second regex pass is a no-op safeguard)
        authors = [re.search(EMAIL_PATTERN, em["from"]).group(0) for em in emailList]
        print("var authors = " + json.dumps(list(set(authors))) + ";", file=f)