예제 #1
0
def dumpMailList(mlname, firstDate=None, lastDate=None, delta=7):
    """Count the messages of a mailing list in consecutive date windows.

    The list is looked up by name and its messages are grouped into
    half-open windows [start, start + delta days).  One summary line per
    window is printed, and the same numbers are returned.

    @param mlname: name of the mailing list to look up
    @param firstDate: start of the range; defaults to the date of the
                      earliest message that has a date.  Only its year,
                      month and day are used for the first window.
    @param lastDate: end of the range; defaults to the date of the latest
                     message that has a date.  Windows are generated while
                     their start is strictly before this value.
    @param delta: width of each window in days, must be positive

    @return: a list of four parallel lists, one entry per window:
             [0] days between the window start and MINDATE,
             [1] number of messages in the window,
             [2] messages without a replyTo (new threads),
             [3] messages with a replyTo (replies).
             All four are empty when a default date is needed but the
             list has no dated messages.

    @raise KeyError: if no list, or more than one list, matches mlname
    @raise ValueError: if delta is not positive
    """
    # a non-positive step would never advance the window below
    if delta <= 0:
        raise ValueError("delta must be a positive number of days, got %r" % (delta,))

    matches = MailList.select(MailList.q.name==mlname)
    numMatches = matches.count()
    if numMatches < 1:
        raise KeyError("""Mailing List "%s" not found""" % (mlname))
    if numMatches > 1:
        raise KeyError("""Mailing List "%s" specifies multiple lists""" % (mlname))
    ml = matches[0]

    bins = [[], [], [], []]

    if not firstDate or not lastDate:
        # ignore all the messages we can't get a date for
        dated = MailMessage.select(AND(MailMessage.q.listID==ml.id,
                                       MailMessage.q.date!=None),
                                   orderBy=MailMessage.q.date)
        try:
            if not firstDate:
                firstDate = dated[0].date
            if not lastDate:
                lastDate = dated.reversed()[0].date
        except IndexError:
            # no dated messages at all: there is nothing to bin
            return bins
    print("%s %s" % (firstDate, lastDate))

    # rebuild the start from its year/month/day only, dropping the rest
    windowStart = timeutil.makeDateTimeFromShortString("%04d%02d%02d" %
                                                       (firstDate.year,
                                                        firstDate.month,
                                                        firstDate.day))
    step = timeutil.makeTimeDelta(days=delta)

    while windowStart < lastDate:
        windowEnd = windowStart + step
        windowMessages = MailMessage.select(AND(MailMessage.q.listID==ml.id,
                                                MailMessage.q.date >= windowStart,
                                                MailMessage.q.date < windowEnd))
        newThreads = 0
        oldThreads = 0
        for msg in windowMessages:
            if msg.replyTo is None:
                newThreads = newThreads + 1
            else:
                oldThreads = oldThreads + 1
        # every message is either a thread starter or a reply
        numMessages = newThreads + oldThreads

        print("%04d-%02d-%02d, %d, %d, %d" % (windowStart.year,
                                              windowStart.month,
                                              windowStart.day,
                                              numMessages,
                                              newThreads,
                                              oldThreads))
        bins[0].append(abs(windowStart-MINDATE).days)
        bins[1].append(numMessages)
        bins[2].append(newThreads)
        bins[3].append(oldThreads)

        windowStart = windowEnd

    return bins
예제 #2
0
                      help="Manually specify logging level (DEBUG, INFO, WARN, etc)",
                      default="INFO", action="store")   
    parser.add_option("-w", "--weeks", action="store", dest="weeks",
                      type="int", default=WEEKSWINDOW, help="number of weeks to look back",
                      metavar="WEEKS")
    parser.add_option("-o", "--overlap", action="store", dest="overlap",
                      type="int", default=WEEKSOVERLAP, help="number of weeks to overlap, default=%d" % WEEKSOVERLAP,
                      metavar="WEEKS")
    parser.add_option("--startdate", action="store", dest="startdate",
                      type="string", default="19970301", help="date to start analysis on")
    parser.add_option("--stopdate", action="store", dest="stopdate",
                      type="string", default="20050801", help="date to stop analysis")
                      
    log.debug("parsing command line arguments")
    (options, args) = parser.parse_args()

    if options.debug:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(getattr(logging,options.loglevel.upper()))

    startDate = timeutil.makeDateTimeFromShortString(options.startdate)
    stopDate = timeutil.makeDateTimeFromShortString(options.stopdate)
    
    # connect to the database
    log.debug("connecting to database: %s - debug=%s", options.uri, options.debug)
    connect(options.uri, debug=options.debug)

    buildData(weeks=options.weeks, start=startDate, stop=stopDate, overlap=options.overlap)
#    buildData(weeks=options.weeks)
예제 #3
0
    
    (options, args) = parser.parse_args()

    if options.verbose:
        log.setLevel(logging.DEBUG)
    connect(options.uri, debug=options.verbose)

    clf()
    grid(True)
    xlabel("Date")
    ylabel("Number of posts")
    ax = subplot(111)
    ctr = 0
    if options.startdate or options.stopdate:
        if options.stopdate:
            endPoint = timeutil.makeDateTimeFromShortString(options.stopdate)
            endPoint = abs(endPoint - MINDATE).days
        else:
            endPoint = ax.get_xlim()[1]
        if options.startdate:
            startPoint = timeutil.makeDateTimeFromShortString(options.startdate)
            startPoint = abs(startPoint - MINDATE).days
        else:
            startPoint = ax.get_xlim()[0]
        ax.set_xlim(startPoint, endPoint)

    for x in args:
        bins = dumpMailList(x, delta=30)
        plot_date(bins[0],bins[1], fmt="%s-" % (colors[ctr]))
        plot_date(bins[0],bins[2], fmt="%s--" % (colors[ctr]))
        plot_date(bins[0],bins[3], fmt="%s:" % (colors[ctr]))
예제 #4
0
def loadFile(filename, maillist, fromHack=False, purge=False, purge_only=False):
    """Load an archive of mailing list messages into the database.

    If an archive with the same file name is already recorded for the list,
    nothing is loaded unless purge is set; in that case the messages stored
    for that archive, together with their recipient and reference rows, are
    deleted first.

    The year and month of the archive are taken from the last two
    dash-separated fields of the file name (after removing the extension and
    an optional ".gz") and are used to set the archive's start and stop
    dates.  A file name without those two fields raises IndexError.

    @param filename: the filename to load
    @param maillist: a dbobjects.MailList object to set as the list object
    @param fromHack: if true, take the sender address from the raw From
                     header by replacing " at " with "@" and keeping the
                     first whitespace-separated token
    @param purge: if true, delete the messages of an already loaded archive
                  and load the file again instead of refusing
    @param purge_only: if true, return right after the purge step without
                       loading any message
    @return: the number of messages stored; 0 when the archive was skipped
             or when purge_only is set

    The in-reply to isn't specified anymore, instead, the following SQL
    command will hopefully load all of the data and set everything right.
    
    UPDATE mail_message set message_parent=a.mail_message_id
      FROM (SELECT a.mail_message_id FROM mail_message a where a.message_id = in_reply_to) AS a
     WHERE message_parent is null and in_reply_to is not null;
    """
    referencesRE = re.compile(r"(<[^>]+>)")

    log.info("processing file %s", filename)

    shortFN = os.path.split(filename)[1]
    loaded = MailFileArchive.select(AND(MailFileArchive.q.filename==shortFN,
                                        MailFileArchive.q.listID==maillist.id))

    # FIXME: this is an outstanding bug that needs to be addressed, basically
    # we can't double load a file, in the future we should check to see if the
    # entries have already been handled
    archive = None
    if loaded.count() > 0:
        if not purge:
            log.error("Archive %s has already been loaded.  For right now, we don't handle this, in the future, we will.", filename)
            return 0
        log.warn("Archive %s has already been loaded, proceeding with purge", filename)
        # fetch the existing row once and reuse it for every statement
        archive = loaded[0]
        # rows that point at the messages are removed before the messages
        purgeQueries = (
            """DELETE FROM mail_message_to WHERE mail_message_id IN
                                   (select mail_message_id from mail_message where mail_file_archive_id=%d)""",
            """DELETE FROM mail_message_reference WHERE mail_message_id IN
                                   (select mail_message_id from mail_message where mail_file_archive_id=%d)""",
            "DELETE FROM mail_message WHERE mail_file_archive_id=%d",
        )
        for template in purgeQueries:
            query = template % (archive.id,)
            log.debug("executing query: %s", query)
            MailMessage._connection.query(query)

    if purge_only:
        log.info("purge only called, returning")
        return 0

    # try to get the month from archive
    short = os.path.splitext(shortFN)
    if short[1] == '.gz':
        short = os.path.splitext(short[0])
    nameFields = short[0].split("-")
    month = nameFields[-1]
    year = nameFields[-2]

    # build the start and stop dates for the archive
    startDate = timeutil.makeDateTimeFromShortString("%04d%02d01" % (int(year), timeutil.getMonth(month)))
    stopDate = timeutil.addMonths(startDate, 1) - timeutil.makeTimeDelta(seconds=1)

    if archive is None:
        archive = MailFileArchive(filename=shortFN, list=maillist,
                                  startDate=startDate, stopDate=stopDate)

    mbox = mailutil.MailList(filename)
    nummsgs = 0
    # mbox.next() hands back None once the archive is exhausted
    for msg in iter(mbox.next, None):
        log.debug("processing message: %s", msg['Message-Id'])
        fromList = rfc822.AddressList(msg['From']).addresslist
        # (address, name) pairs, lower-cased, parsed once per header
        toRecips = [(addr.lower(), name.lower())
                    for name, addr in rfc822.AddressList(msg['To']).addresslist]
        ccRecips = [(addr.lower(), name.lower())
                    for name, addr in rfc822.AddressList(msg['cc']).addresslist]

        try:
            msgFrom = fromList[0][1].lower()
        except (IndexError, AttributeError, TypeError):
            log.warn("From not properly defined")
            msgFrom = "*****@*****.**"
        try:
            msgFromName = fromList[0][0].lower()
        except (IndexError, AttributeError, TypeError):
            log.warn("From name not properly defined")
            msgFromName = None

        if fromHack:
            # a missing or blank From header keeps the address chosen above
            hackedFrom = (msg['From'] or "").replace(" at ","@").split()
            if hackedFrom:
                msgFrom = hackedFrom[0]

        try:
            timestamp = timeutil.makeDateTimeFromTuple(rfc822.parsedate(msg['date']))
        except Exception:
            log.warn("Error parsing date: %s - setting to None", msg['date'])
            timestamp = None

        try:
            messageId = msg['Message-Id'].split(";")[0]
        except AttributeError:
            # header is missing
            messageId = None
        if not messageId:
            messageId = "::CVSMINER::-"+random_string(length=64)
        # FIXME: messageID should be a little more robust in searching out
        # properly formatted messages

        pl = deList(msg.get_payload())
        if hasattr(pl, "append"):
            # still a list of parts: glue their payloads together
            log.debug("is list")
            pl = "".join([part.get_payload() for part in pl])

        if msg['In-Reply-To']:
            replyTo = msg['In-Reply-To'][:255].split(";")[0].strip()
        else:
            replyTo = None

        # clip the short text fields to 255 characters
        if msgFrom:
            msgFrom = msgFrom[:255]
        if msgFromName:
            msgFromName = msgFromName[:255]
        if msg['Subject']:
            subject = msg['Subject'][:255]
        else:
            subject = "::CVSMINER:: Subject Not Defined"
        if messageId:
            messageId = messageId[:255]

        try:
            m = create_mail_message(fromemail=msgFrom, fromname=msgFromName, subject=subject, body=pl,
                                    date=timestamp, messageid=messageId, maillist=maillist,
                                    archive=archive, replyto=replyTo)
        except UnicodeError:
            # skipped messages are not included in the returned count
            log.error("Unable to parse message no matter how hard I try...")
            continue

        # map all of the references for the message
        if msg['References']:
            for reference in referencesRE.findall(msg['References']):
                create_mail_reference(message=m, reference=reference)

        # seen tracks the addresses already stored for this message, so an
        # address listed twice (or in both To and Cc) is stored only once
        seen = set()
        for recips, isTo in ((toRecips, True), (ccRecips, False)):
            for addr, name in recips:
                if addr in seen:
                    continue
                try:
                    create_mail_recipient(message=m, toemail=addr, toname=name, isto=isTo)
                except UnicodeDecodeError:
                    log.warn("Unable to store recipient %r - skipping", addr)
                    continue
                seen.add(addr)

        nummsgs = nummsgs + 1
    return nummsgs