def dumpMailList(mlname, firstDate=None, lastDate=None, delta=7): ml = MailList.select(MailList.q.name==mlname) if ml.count() < 1: raise KeyError("""Mailing List "%s" not found""" % (mlname)) if ml.count() > 1: raise KeyError("""Mailing List "%s" specifies multiple lists""" % (mlname)) ml = ml[0] # ignore all the messages we can't get a date for messages = MailMessage.select(AND(MailMessage.q.listID==ml.id, MailMessage.q.date!=None), orderBy=MailMessage.q.date) if not firstDate: firstDate = messages[0].date if not lastDate: lastDate = messages.reversed()[0].date print firstDate, lastDate firstDate = timeutil.makeDateTimeFromShortString("%04d%02d%02d" % (firstDate.year, firstDate.month, firstDate.day)) bins = [[],[],[],[]] while firstDate < lastDate: nextDate = firstDate + timeutil.makeTimeDelta(days=delta) messages = MailMessage.select(AND(MailMessage.q.listID==ml.id, MailMessage.q.date >= firstDate, MailMessage.q.date < nextDate)) numMessages = messages.count() newThreads = 0 oldThreads = 0 for msg in messages: if msg.replyTo == None: newThreads = newThreads + 1 else: oldThreads = oldThreads + 1 print "%04d-%02d-%02d, %d, %d, %d" % (firstDate.year, firstDate.month, firstDate.day, numMessages, newThreads, oldThreads) bins[0].append(abs(firstDate-MINDATE).days) bins[1].append(numMessages) bins[2].append(newThreads) bins[3].append(oldThreads) firstDate = nextDate return bins
help="Manually specify logging level (DEBUG, INFO, WARN, etc)", default="INFO", action="store") parser.add_option("-w", "--weeks", action="store", dest="weeks", type="int", default=WEEKSWINDOW, help="number of weeks to look back", metavar="WEEKS") parser.add_option("-o", "--overlap", action="store", dest="overlap", type="int", default=WEEKSOVERLAP, help="number of weeks to overlap, default=%d" % WEEKSOVERLAP, metavar="WEEKS") parser.add_option("--startdate", action="store", dest="startdate", type="string", default="19970301", help="date to start analysis on") parser.add_option("--stopdate", action="store", dest="stopdate", type="string", default="20050801", help="date to stop analysis") log.debug("parsing command line arguments") (options, args) = parser.parse_args() if options.debug: log.setLevel(logging.DEBUG) else: log.setLevel(getattr(logging,options.loglevel.upper())) startDate = timeutil.makeDateTimeFromShortString(options.startdate) stopDate = timeutil.makeDateTimeFromShortString(options.stopdate) # connect to the database log.debug("connecting to database: %s - debug=%s", options.uri, options.debug) connect(options.uri, debug=options.debug) buildData(weeks=options.weeks, start=startDate, stop=stopDate, overlap=options.overlap) # buildData(weeks=options.weeks)
(options, args) = parser.parse_args() if options.verbose: log.setLevel(logging.DEBUG) connect(options.uri, debug=options.verbose) clf() grid(True) xlabel("Date") ylabel("Number of posts") ax = subplot(111) ctr = 0 if options.startdate or options.stopdate: if options.stopdate: endPoint = timeutil.makeDateTimeFromShortString(options.stopdate) endPoint = abs(endPoint - MINDATE).days else: endPoint = ax.get_xlim()[1] if options.startdate: startPoint = timeutil.makeDateTimeFromShortString(options.startdate) startPoint = abs(startPoint - MINDATE).days else: startPoint = ax.get_xlim()[0] ax.set_xlim(startPoint, endPoint) for x in args: bins = dumpMailList(x, delta=30) plot_date(bins[0],bins[1], fmt="%s-" % (colors[ctr])) plot_date(bins[0],bins[2], fmt="%s--" % (colors[ctr])) plot_date(bins[0],bins[3], fmt="%s:" % (colors[ctr]))
def loadFile(filename, maillist, fromHack=False, purge=False, purge_only=False):
    """Loads and archive of mailing list messages into the database.

    Right now this function does not handle running multiple times over the
    same mailing list.  That's an outsanding bug.

    @param filename: - the filename to load
    @param mc: a dict to cache messages into
    @param maillist: a dbobjects.MailList object to set as the list object

    The in-reply to isn't specified anymore, instead, the following SQL
    command will hopefully load all of the data and set everything right.

    UPDATE mail_message set message_parent=a.mail_message_id FROM
    (SELECT a.mail_message_id FROM mail_message a where
     a.message_id = in_reply_to) AS a
    WHERE message_parent is null and in_reply_to is not null;
    """
    # NOTE(review): additional keyword args beyond the docstring -
    #   fromHack: rewrite "user at host" obfuscated From headers back to
    #             real addresses (see below)
    #   purge: delete any previously-loaded rows for this archive, then reload
    #   purge_only: delete previously-loaded rows and return without loading
    nummsgs = 0
    # matches each "<...>" token in a References header
    referencesRE = re.compile(r"(<[^>]+>)")
    log.info("processing file %s", filename)
    shortFN = os.path.split(filename)[1]
    # has this archive file already been loaded for this list?
    archive = MailFileArchive.select(AND(MailFileArchive.q.filename==shortFN,
                                         MailFileArchive.q.listID==maillist.id))
    # FIXME: this is an outstanding bug that needs to be addressed, basically
    # we can't double load a file, in the future we should check to see if the
    # entries have already been handled
    if archive.count() > 0:
        if not purge:
            # already loaded and no purge requested: refuse to double-load
            # NOTE(review): this log message was split across lines by an
            # extraction artifact in the original - rejoined here, verify
            # wording against upstream
            log.error("Archive %s has already been loaded. For right now, we don't handle this, in the future, we will.", filename)
            return 0
        else:
            log.warn("Archive %s has already been loaded, proceeding with purge", filename)
            # delete child tables first, then the messages themselves -
            # presumably ordered this way to satisfy foreign-key constraints
            query = """DELETE FROM mail_message_to WHERE mail_message_id IN
(select mail_message_id from mail_message where mail_file_archive_id=%d)""" % (archive[0].id)
            log.debug("executing query: %s", query)
            MailMessage._connection.query(query)
            query = """DELETE FROM mail_message_reference WHERE mail_message_id IN
(select mail_message_id from mail_message where mail_file_archive_id=%d)""" % (archive[0].id)
            log.debug("executing query: %s", query)
            MailMessage._connection.query(query)
            query = "DELETE FROM mail_message WHERE mail_file_archive_id=%d" % (archive[0].id)
            log.debug("executing query: %s", query)
            MailMessage._connection.query(query)
            # reuse the existing archive row for the reload below
            archive = archive[0]
    else:
        archive = None
    if purge_only:
        log.info("purge only called, returning")
        return 0
    # try to get the month from archive
    # NOTE(review): assumes filenames shaped like "<list>-<year>-<month>.txt"
    # (optionally .gz-compressed) - TODO confirm against the archive naming
    short = os.path.splitext(shortFN)
    if short[1] == '.gz':
        short = os.path.splitext(short[0])
    month = short[0].split("-")[-1]
    year = short[0].split("-")[-2]
    # build the start and stop dates for the archive
    # (stopDate = one second before the first instant of the next month)
    startDate=timeutil.makeDateTimeFromShortString("%04d%02d01" % (int(year), timeutil.getMonth(month)))
    stopDate=timeutil.addMonths(startDate,1) - timeutil.makeTimeDelta(seconds=1)
    if not archive:
        archive = MailFileArchive(filename=shortFN, list=maillist,
                                  startDate=startDate, stopDate=stopDate)
    # iterate the mbox one message at a time
    mbox = mailutil.MailList(filename)
    msg = mbox.next()
    lastDate = None
    while msg != None:
        log.debug("processing message: %s", msg['Message-Id'])
        # (name, address) pairs for From; addresses/names split out for To/cc
        fromList = [x for x in rfc822.AddressList(msg['From']).addresslist]
        toList = [x[1].lower() for x in rfc822.AddressList(msg['To']).addresslist]
        toNames = [x[0].lower() for x in rfc822.AddressList(msg['To']).addresslist]
        ccList = [x[1].lower() for x in rfc822.AddressList(msg['cc']).addresslist]
        ccNames = [x[0].lower() for x in rfc822.AddressList(msg['cc']).addresslist]
        try:
            msgFrom = fromList[0][1].lower()
        # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit;
        # presumably only IndexError (empty fromList) is expected here
        except:
            log.warn("From not properly defined")
            # placeholder sender for messages with an unparseable From
            msgFrom = "*****@*****.**"
        try:
            msgFromName = fromList[0][0].lower()
        except:
            log.warn("From name not properly defined")
            msgFromName = None
        if fromHack:
            # de-obfuscate "user at example.com" style From headers
            msgFrom = msg['From'].replace(" at ","@").split()[0]
        try:
            timestamp = timeutil.makeDateTimeFromTuple(rfc822.parsedate(msg['date']))
        except:
            log.warn("Error parsing date: %s - setting to None", msg['date'])
            timestamp = None
        try:
            messageId = msg['Message-Id'].split(";")[0]
        except:
            messageId = None
        if not messageId:
            # synthesize a unique id so the row can still be inserted
            messageId = "::CVSMINER::-"+random_string(length=64)
        # FIXME: messageID should be a little more robust in searching out
        # properly formatted messages
        pl = deList(msg.get_payload())
        # pl = str(msg.get_payload())
        if hasattr(pl,"append"):
            # multipart message: flatten all sub-payloads into one body string
            log.debug("is list")
            tmpPl = ""
            for payload in pl:
                tmpPl = tmpPl + payload.get_payload()
            pl = tmpPl
        # truncate header-derived fields to the 255-char database columns
        if msg['In-Reply-To']:
            replyTo = msg['In-Reply-To'][:255].split(";")[0].strip()
        else:
            replyTo = None
        if msgFrom:
            msgFrom = msgFrom[:255]
        if msgFromName:
            msgFromName = msgFromName[:255]
        if msg['Subject']:
            subject = msg['Subject'][:255]
        else:
            subject = "::CVSMINER:: Subject Not Defined"
        if messageId:
            messageId = messageId[:255]
        try:
            m = create_mail_message(fromemail=msgFrom, fromname=msgFromName,
                                    subject=subject, body=pl, date=timestamp,
                                    messageid=messageId, maillist=maillist,
                                    archive=archive, replyto=replyTo)
        except UnicodeError:
            # skip messages whose text can't be coerced into the database
            # encoding; note nummsgs is NOT incremented for skipped messages
            log.error("Unable to parse message no matter how hard I try...")
            msg = mbox.next()
            continue
        # map all of the references for the message
        if msg['References']:
            map(lambda x: create_mail_reference(message=m, reference=x),
                referencesRE.findall(msg['References']))
        # seen is a dict that we use to track already captured email
        # addresses
        seen = {}
        for recip in zip(toList, toNames):
            if not seen.has_key(recip[0]):
                try:
                    mr = create_mail_recipient(message=m, toemail=recip[0],
                                               toname=recip[1], isto=True)
                    seen[recip[0]] = 1
                except UnicodeDecodeError:
                    # undecodable recipient: best-effort skip
                    pass
        for recip in zip(ccList,ccNames):
            if not seen.has_key(recip[0]):
                try:
                    mr = create_mail_recipient(message=m, toemail=recip[0],
                                               toname=recip[1], isto=False)
                    seen[recip[0]] = 1
                except UnicodeDecodeError:
                    pass
        msg = mbox.next()
        nummsgs = nummsgs + 1
    return nummsgs