Example #1
File: rank.py Project: anukat2015/newman
# -*- coding: utf-8 -*-

import sys

sys.path.append("./demail")

from newman.utils.file import slurpA
from newman.utils.functions import head, last, nth

if __name__ == "__main__":

    recipients = {}

    SourceEmail = sys.argv[1]

    lines = slurpA("tmp/exploded.csv")
    for line in lines:
        (dt, src, target) = line.strip().split('\t')

        if src != SourceEmail or target == SourceEmail:
            continue
        else:

            if target in recipients:
                recipients[target] += 1
            else:
                recipients[target] = 1

    ranked = sorted(recipients.items(), key=lambda x: (-x[1], x[0]))[:20]
    top = float(nth(head(ranked), 1))
    step = 1.0/top
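
The snippet above tallies, for one sender, how many messages went to each other address, keeps the top 20 recipients, and derives a per-rank step from the highest count; the rest of the file is not shown here. A minimal standalone sketch of the same counting and ranking idea, using collections.Counter instead of the manual dict and assuming the same three tab-separated columns (date, source, target):

from collections import Counter

def top_recipients(path, source_email, limit=20):
    # Tally how many messages source_email sent to each other address.
    counts = Counter()
    with open(path) as fh:
        for line in fh:
            dt, src, target = line.rstrip("\n").split("\t")
            if src != source_email or target == source_email:
                continue
            counts[target] += 1
    # Highest count first, ties broken by address, keep the top `limit`,
    # matching the sorted(...) call above.
    ranked = sorted(counts.items(), key=lambda x: (-x[1], x[0]))[:limit]
    top = float(ranked[0][1]) if ranked else 0.0
    step = 1.0 / top if top else 0.0
    return ranked, step
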
Example #2
stmt = (
    'SELECT distinct f.subject '
    ' FROM facts f where schema_name = "email_addr" and predicate = "community"'
)


def writeRanks(ids):
    with newman_connector() as read_cnx1, \
            newman_connector() as read_cnx, \
            newman_connector() as write_cnx:
        with execute_query(read_cnx1.conn(), stmt) as qry:
            txid = Tx(read_cnx.conn()).next()
            print "tx: %s" % txid
            facts = Fact(write_cnx.conn(), autocommit=False)
            print "assigning ranks"
            for mail in qry.cursor():
                #print mail[0] #, "email_addr", "rank", ids.get(mail,'0'), txid
                facts.addFact(mail[0], "email_addr", "rank",
                              ids.get(mail[0], '0'), txid)

            print "commit"
            write_cnx.commit()


if __name__ == "__main__":
    ids = {}
    lines = slurpA("tmp/rankings")
    for line in lines:
        rank, mails = line.split(':')
        mails = mails.strip()
        for mail in mails.split(','):
            ids[mail] = rank
    writeRanks(ids)
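
In this snippet, writeRanks opens three connections (one to read the distinct addresses, one to allocate a transaction id, one to write), then stores a "rank" fact for every address returned by the query, falling back to '0' for addresses missing from ids. The __main__ block builds ids from a rankings file whose lines look like "rank: addr1,addr2,...". A small self-contained sketch of that parsing step (the sample lines and addresses are invented):

def parse_rankings(lines):
    # Map every address after the colon to the rank printed before it.
    ids = {}
    for line in lines:
        rank, mails = line.split(':', 1)
        for mail in mails.strip().split(','):
            ids[mail.strip()] = rank.strip()
    return ids

sample = ["1: alice@example.com,bob@example.com",
          "2: carol@example.com"]
print(parse_rankings(sample))
# {'alice@example.com': '1', 'bob@example.com': '1', 'carol@example.com': '2'}
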
Example #3
    headers = [
        "id", "threadid", "dir", "category", "datetime", "from", "tos", "ccs",
        "bccs", "subject", "body", "tosize", "ccsize", "attachsize", "attach",
        "bodysize", "location"
    ]

    #skip header row for counting
    c = counter(-1)

    with newman_connector() as cnx:

        tx = Tx(cnx.conn()).next()
        print "tx: %s" % tx
        fact = Fact(cnx.conn(), autocommit=False)

        for line in slurpA(args.input_tsv):
            try:
                count = c.next()
                if count % 1000 == 0:
                    print "ingested count - %s " % count
                row = line.split('\t')
                row = (c.strip() for c in row)

                num, dir, category, utc_date, importance, fromemail, ip, toemail, ccemail, bccemail, attach, messageid, inreplyto, references, subject, body = row

                fromemail = lower(fromemail)
                toemail = lower(toemail)
                ccemail = lower(ccemail)
                bccemail = lower(bccemail)

                network = ''
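
The ingest loop above starts its counter at -1 so the header row is not counted, reports progress every 1000 rows, splits each tab-separated line into sixteen fields, strips them, and lower-cases the address columns before building facts (the rest of the loop is cut off). A standalone sketch of that row handling without the newman helpers; the column list mirrors the tuple unpacking above:

COLUMNS = ["num", "dir", "category", "utc_date", "importance", "fromemail",
           "ip", "toemail", "ccemail", "bccemail", "attach", "messageid",
           "inreplyto", "references", "subject", "body"]

def iter_rows(path):
    with open(path) as fh:
        next(fh)                                # skip the header row
        for count, line in enumerate(fh):
            if count % 1000 == 0:
                print("ingested count - %s " % count)
            fields = [c.strip() for c in line.split("\t")]
            row = dict(zip(COLUMNS, fields))
            # Normalise the address columns, as the snippet does with lower().
            for key in ("fromemail", "toemail", "ccemail", "bccemail"):
                row[key] = row[key].lower()
            yield row
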
Example #4
File: rank.py Project: molumbymj/newman
# -*- coding: utf-8 -*-

import sys

sys.path.append("./demail")

from newman.utils.file import slurpA
from newman.utils.functions import head, last, nth

if __name__ == "__main__":

    recipients = {}

    SourceEmail = sys.argv[1]

    lines = slurpA("tmp/exploded.csv")
    for line in lines:
        (dt, src, target) = line.strip().split('\t')

        if src != SourceEmail or target == SourceEmail:
            continue
        else:

            if target in recipients:
                recipients[target] += 1
            else:
                recipients[target] = 1

    ranked = sorted(recipients.items(), key=lambda x: (-x[1], x[0]))[:20]
    top = float(nth(head(ranked), 1))
    step = 1.0 / top
Example #5
stmt = (
    'SELECT distinct f.subject '
    ' FROM facts f where schema_name = "email_addr" and predicate = "community"'
)

def writeRanks(ids):
    with newman_connector() as read_cnx1, newman_connector() as read_cnx, newman_connector() as write_cnx:
        with execute_query(read_cnx1.conn(), stmt) as qry:
            txid = Tx(read_cnx.conn()).next()
            print "tx: %s" % txid
            facts = Fact(write_cnx.conn(), autocommit=False)
            print "assigning ranks"
            for mail in qry.cursor():
                #print mail[0] #, "email_addr", "rank", ids.get(mail,'0'), txid
                facts.addFact(mail[0], "email_addr", "rank", ids.get(mail[0], '0'), txid)

            print "commit"
            write_cnx.commit()
            


if __name__ == "__main__":
    ids = {}
    lines = slurpA("tmp/rankings")
    for line in lines:
        rank, mails = line.split(':')
        mails = mails.strip()
        for mail in mails.split(','):
            ids[mail] = rank
    writeRanks(ids)
Example #6
    parser = argparse.ArgumentParser(description='Ingest Walker Email')
    parser.add_argument("input_tsv", help="input of tsv file")
    args = parser.parse_args()

    headers = [
        "id", "threadid", "dir", "category", "datetime", "from", "tos", "ccs",
        "bccs", "subject", "body", "tosize", "ccsize", "attachsize", "attach",
        "bodysize", "location"
    ]

    #skip header row for counting
    c = counter(-1)

    with newman_connector() as cnx:

        tx = Tx(cnx.conn()).next()
        print "tx: %s" % tx        
        fact = Fact(cnx.conn(), autocommit=False)

        for line in slurpA(args.input_tsv):
            try:
                count = c.next()
                if count % 1000 == 0:
                    print "ingested count - %s " % count
                row = line.split('\t')
                row = (c.strip() for c in row)
            
                num, dir, category, utc_date, importance, fromemail, ip, toemail, ccemail, bccemail, attach, messageid, inreplyto, references, subject, body = row

                fromemail = lower(fromemail)
                toemail = lower(toemail)
                ccemail = lower(ccemail)
                bccemail = lower(bccemail)
            
                network = ''
Example #7
            " values (%s, %s, %s, %s, %s, %s)")
    with execute_nonquery(conn, stmt, category_id, idx, value, score, purity, docs) as qry:
        pass

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Ingest Walker Email Topics')
    parser.add_argument("topic_idx", help="topics index file")
    parser.add_argument("topic_scores", help="topic scores file")
    args = parser.parse_args()

    flush = partial(flush_buffer, "tmp/bulk_topic_score.dat")
    
    #index   topic_score     doc_purity      percent_docs    summary0        summary1  etc... 
    #0     8.09   0.557   14.54   governor        state   jobs    candidate       rail    ad      gubernatorial   primary election        race

    scores_items = [line.split('\t') for line in slurpA(args.topic_idx)[1:]]
    scores_items = [map(lambda s: s.strip(), line) for line in scores_items]
    scores_items = [(i[0], i[1], i[2], i[3], " ".join(i[4:])) for i in scores_items]

    topics = {"topic_{0}".format(i[0]):i[1:] for i in scores_items}
    #topics = {"topic_{0}".format(i):v for i,v in enumerate(slurpA(args.topic_idx)) }

    c = counter(0)

    with newman_connector() as cnx:
        insert_topic = partial(insert_topic_category, cnx.conn(), "all")
        print "import topics "
        for k, v in topics.iteritems():
            idx = k.replace("topic_", "")
            #score, purity, docs, summary = v.split(None, 3)
            score, purity, docs, summary = v
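
The block above reads the topic index file, skips its header line, strips each tab-separated field, keeps the first four columns (index, topic_score, doc_purity, percent_docs), joins the remaining summary words, and keys each entry as "topic_<index>" before inserting it. A self-contained sketch of that parsing, with two invented sample lines in the format shown by the comments:

def parse_topic_index(lines):
    topics = {}
    for line in lines[1:]:                      # skip the header line
        parts = [p.strip() for p in line.split("\t")]
        idx, score, purity, docs = parts[:4]
        summary = " ".join(parts[4:])
        topics["topic_%s" % idx] = (score, purity, docs, summary)
    return topics

sample = [
    "index\ttopic_score\tdoc_purity\tpercent_docs\tsummary0\tsummary1\tsummary2",
    "0\t8.09\t0.557\t14.54\tgovernor\tstate\tjobs",
]
print(parse_topic_index(sample))
# {'topic_0': ('8.09', '0.557', '14.54', 'governor state jobs')}
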
Example #8
                          docs) as qry:
        pass


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Ingest Walker Email Topics')
    parser.add_argument("topic_idx", help="topics index file")
    parser.add_argument("topic_scores", help="topic scores file")
    args = parser.parse_args()

    flush = partial(flush_buffer, "tmp/bulk_topic_score.dat")

    #index   topic_score     doc_purity      percent_docs    summary0        summary1  etc...
    #0     8.09   0.557   14.54   governor        state   jobs    candidate       rail    ad      gubernatorial   primary election        race

    scores_items = [line.split('\t') for line in slurpA(args.topic_idx)[1:]]
    scores_items = [map(lambda s: s.strip(), line) for line in scores_items]
    scores_items = [(i[0], i[1], i[2], i[3], " ".join(i[4:]))
                    for i in scores_items]

    topics = {"topic_{0}".format(i[0]): i[1:] for i in scores_items}
    #topics = {"topic_{0}".format(i):v for i,v in enumerate(slurpA(args.topic_idx)) }

    c = counter(0)

    with newman_connector() as cnx:
        insert_topic = partial(insert_topic_category, cnx.conn(), "all")
        print "import topics "
        for k, v in topics.iteritems():
            idx = k.replace("topic_", "")
            #score, purity, docs, summary = v.split(None, 3)