def ingestESTextResults(hits):
    """Replace the contents of search_results with the ids of *hits*.

    Wipes the table, inserts one row per Elasticsearch hit (the hit's
    ``_id`` is the email id), then commits once for the whole batch.
    """
    insert_stmt = "insert into search_results (email_id) values (%s)"
    with newman_connector() as cnx:
        # clear out whatever the previous search left behind
        with execute_query(cnx.conn(), "delete from search_results") as _:
            pass
        for es_hit in hits:
            with execute_query(cnx.conn(), insert_stmt, es_hit["_id"]) as _:
                pass
        # one commit covers the delete and all inserts
        cnx.commit()
def queryEntity(email):
    """Return every entity row attached to email id *email* (empty list if none)."""
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_email_entities_by_id, email) as qry:
            tangelo.log("node-vals: %s" % qry.stmt)
            rows = list(qry.cursor())
            return rows if rows else []
def queryEmail(email):
    """Fetch a single email row by id; returns [] when no row matches."""
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_email_by_id, email) as qry:
            tangelo.log("node-vals: %s" % qry.stmt)
            row = qry.cursor().fetchone()
            tangelo.content_type("application/json")
            return row if row else []
def getExportable(*args):
    """List [id, subject] for every email flagged exportable, as JSON."""
    sql = (" SELECT id, subject FROM email WHERE exportable='true' ")
    tangelo.content_type("application/json")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), sql) as qry:
            exportable = [[str(col) for col in record] for record in qry.cursor()]
            return {"emails": exportable}
def getEdges(node_idx, field, args_array):
    """Build d3-style edge dicts; endpoints are looked up through *node_idx*."""
    with newman_connector() as read_cnx:
        tangelo.log("start edge query")
        with execute_query(*edgeQueryObj(read_cnx.conn(), field, args_array)) as qry:
            tangelo.log("edges : %s" % qry.stmt)
            edges = []
            for src, dst, weight in qry.cursor():
                edges.append({"source": node_idx.get(src),
                              "target": node_idx.get(dst),
                              "value": int(weight)})
            return edges
def getTopics(_id):
    """Return name/score dicts for every topic row attached to email *_id*."""
    with newman_connector() as cnx:
        with execute_query(cnx.conn(), topic_stmt, _id) as qry:
            topics = []
            for row in qry.cursor():
                topics.append({'name': formatName(nth(row, 0)),
                               'score': formatScore(nth(row, 1))})
            return topics
def getDomains(*args):
    """Count email addresses per domain (text after '@'), returned as JSON."""
    sql = ("SELECT SUBSTRING_INDEX(email_addr, '@', -1) as eml, count(1) from email_addr group by eml")
    tangelo.content_type("application/json")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), sql) as qry:
            domains = [[str(col) for col in record] for record in qry.cursor()]
            return {"domains": domains}
def getExportable(*args):
    """JSON listing of id/subject for emails marked exportable='true'."""
    sql = (" SELECT id, subject FROM email WHERE exportable='true' ")
    tangelo.content_type("application/json")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), sql) as qry:
            out = []
            for row in qry.cursor():
                out.append([str(v) for v in row])
            return {"emails": out}
def getDomains(*args):
    """Per-domain address counts (domain = substring after '@'), as JSON."""
    sql = ("SELECT SUBSTRING_INDEX(email_addr, '@', -1) as eml, count(1) from email_addr group by eml")
    tangelo.content_type("application/json")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), sql) as qry:
            out = []
            for row in qry.cursor():
                out.append([str(v) for v in row])
            return {"domains": out}
def topic_list(*args):
    """Return topic categories for the requested category id (default 'all')."""
    category = nth(args, 0, 'all')
    with newman_connector() as read_cnx:
        sql = (" select idx, value, docs from topic_category "
               " where category_id = %s "
               " order by idx ")
        with execute_query(read_cnx.conn(), sql, category) as qry:
            categories = list(qry.cursor())
            tangelo.content_type("application/json")
            return {"categories": categories}
def getNodeVals(field, args_array):
    """Map each email address to its graph-node attributes.

    Nodes are all emails an address participates in, plus every address
    associated with that set of emails.  Row layout (by position):
    0=addr, 1=community, 2=community id, 3=group id, 4/5=received/sent
    counts, 6=rank.
    """
    with newman_connector() as read_cnx:
        tangelo.log("start node query")
        with execute_query(*nodeQueryObj(read_cnx.conn(), field, args_array)) as qry:
            tangelo.log("node-vals: %s" % qry.stmt)
            vals = {}
            for row in qry.cursor():
                vals[row[0]] = {'num': int(row[4] + row[5]),
                                'comm_id': row[2],
                                'group_id': row[3],
                                'comm': row[1],
                                'rank': row[6]}
            return vals
def getRollup(*args):
    """Resolve a URL-quoted entity string to its rollup id; 400 if missing."""
    entity = urllib.unquote(nth(args, 0, ""))
    if not entity:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_entity_rollup_id, entity) as qry:
            row = qry.cursor().fetchone()
            tangelo.content_type("application/json")
            return {"rollupId": row}
def getRollup(*args):
    """Look up the rollup id for an entity; HTTP 400 when the id is absent."""
    entity = urllib.unquote(nth(args, 0, ''))
    if not entity:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_entity_rollup_id, entity) as qry:
            first_row = qry.cursor().fetchone()
            tangelo.content_type("application/json")
            return {"rollupId": first_row}
def getRankedEmails(*args):
    """All email addresses with rank > 0, ordered by rank descending, as JSON."""
    tangelo.content_type("application/json")
    sql = (" select email_addr, community, community_id, group_id, rank, total_received, total_sent "
           " from email_addr "
           " where rank > 0 "
           " order by cast(rank as decimal(4,4)) desc")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), sql) as qry:
            ranked = [[str(col) for col in record] for record in qry.cursor()]
            return {"emails": ranked}
def getTopRollup(*args):
    """Return the top rolled-up entities, limited to *amt* rows.

    Responds 400 when amt is missing or not a plain positive integer.
    Fix: *amt* comes from the URL and is interpolated directly into the
    SQL text, which was an injection vector; it is now validated to be
    digits-only before being formatted in.
    """
    amt = urllib.unquote(nth(args, 0, ""))
    if not amt:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")
    # amt is spliced into the statement below, so reject anything non-numeric
    if not amt.isdigit():
        return tangelo.HTTPStatusCode(400, "invalid service call - amt must be a positive integer")
    stmt = stmt_top_rollup_entities + ("limit {0}".format(amt))
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            rtn = [r for r in qry.cursor()]
            rtn = rtn if rtn else []
            tangelo.content_type("application/json")
            return {"entities": rtn}
def topic_list(*args):
    """Topic categories for the given category id (default 'all'), as JSON."""
    which = nth(args, 0, 'all')
    with newman_connector() as read_cnx:
        sql = (" select idx, value, docs from topic_category "
               " where category_id = %s "
               " order by idx ")
        with execute_query(read_cnx.conn(), sql, which) as qry:
            found = [row for row in qry.cursor()]
            tangelo.content_type("application/json")
            return {"categories": found}
def getEmails(colors, field, args_array):
    """Run the email query and return one dict per email.

    Each row is zipped against the fixed column names; 'fromcolor' is
    added from the *colors* mapping keyed by the sender address.
    """
    cols = ('num', 'directory', 'datetime', 'from', 'to', 'cc', 'bcc',
            'subject', 'attach', 'bodysize')
    rows = []
    with newman_connector() as read_cnx:
        tangelo.log("start email query")
        with execute_query(*emailQueryObj(read_cnx.conn(), field, args_array)) as qry:
            tangelo.log("emails : %s" % qry.stmt)
            for record in qry.cursor():
                entry = dict(zip(cols, record))
                entry["fromcolor"] = colors.get(entry.get('from'))
                rows.append(entry)
    return rows
def getRankedEmails(*args):
    """Ranked email addresses (rank > 0), highest first, as JSON string rows."""
    tangelo.content_type("application/json")
    sql = (" select email_addr, community, community_id, group_id, rank, total_received, total_sent "
           " from email_addr "
           " where rank > 0 "
           " order by cast(rank as decimal(4,4)) desc")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), sql) as qry:
            result = []
            for row in qry.cursor():
                result.append([str(v) for v in row])
            return {"emails": result}
def getTopRollup(*args):
    """Top rolled-up entities, capped at *amt* rows; 400 on bad input.

    Fix: *amt* (taken from the URL) was concatenated into the SQL LIMIT
    clause unchecked — a SQL-injection hole.  It must now be digits-only.
    """
    amt = urllib.unquote(nth(args, 0, ''))
    if not amt:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")
    # guard the string-built LIMIT clause against injection
    if not amt.isdigit():
        return tangelo.HTTPStatusCode(400, "invalid service call - amt must be a positive integer")
    stmt = stmt_top_rollup_entities + ("limit {0}".format(amt))
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            rtn = [r for r in qry.cursor()]
            rtn = rtn if rtn else []
            tangelo.content_type("application/json")
            return {"entities": rtn}
def token_dict(cnx):
    """Group every row of the entity table by its email_id.

    Returns {email_id: [row, ...]} where each row is the full
    (email_id, idx, offset, value, entity_type, subject) tuple.

    Fix: the original queried ``read_cnx.conn()``, a name that does not
    exist in this scope — every call raised NameError.  It now uses the
    *cnx* parameter as intended.
    """
    sql = ('select email_id, idx, offset, value, entity_type, subject from entity')
    with execute_query(cnx.conn(), sql) as qry:
        rtn = {}
        for row in qry.cursor():
            email_id = row[0]
            # accumulate all entity rows for the same email
            rtn.setdefault(email_id, []).append(row)
        return rtn
def writeRanks(ids): with newman_connector() as read_cnx1, newman_connector() as read_cnx, newman_connector() as write_cnx: with execute_query(read_cnx1.conn(), stmt) as qry: txid = Tx(read_cnx.conn()).next() print "tx: %s" % txid facts = Fact(write_cnx.conn(), autocommit=False) print "assigning ranks" for mail in qry.cursor(): #print mail[0] #, "email_addr", "rank", ids.get(mail,'0'), txid facts.addFact(mail[0], "email_addr", "rank", ids.get(mail[0],'0'), txid) print "commit" write_cnx.commit()
def getTarget(*args):
    """Return stats for the target address (the user whose mail is analyzed)."""
    # TODO: read target from file or config
    target = getOpt('target')
    sql = (" select e.email_addr, e.community, e.community_id, e.group_id, e.total_received, e.total_sent, e.rank "
           " from email_addr e "
           " where e.email_addr = %s ")
    tangelo.content_type("application/json")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), sql, target) as qry:
            rows = [[str(col) for col in record] for record in qry.cursor()]
            return {"email": rows}
def getTarget(*args):
    """Stats row(s) for the analyzed user's own email address, as JSON."""
    # TODO: read target from file or config instead of getOpt
    target = getOpt('target')
    sql = (" select e.email_addr, e.community, e.community_id, e.group_id, e.total_received, e.total_sent, e.rank "
           " from email_addr e "
           " where e.email_addr = %s ")
    tangelo.content_type("application/json")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), sql, target) as qry:
            out = []
            for row in qry.cursor():
                out.append([str(v) for v in row])
            return {"email": out}
def getAttachmentsSender(*args):
    """Emails with attachments sent by the given (URL-quoted) address."""
    sender = urllib.unquote(nth(args, 0, ''))
    if not sender:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")
    tangelo.content_type("application/json")
    sql = (" select id, dir, datetime, from_addr, tos, ccs, bccs, subject, attach, bodysize "
           " from email "
           " where from_addr = %s and attach != '' ")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), sql, sender) as qry:
            def _fmt(val):
                # utf-8 encode text columns, stringify everything else
                return val.encode('utf-8') if isinstance(val, basestring) else str(val)
            attachments = [[_fmt(v) for v in row] for row in qry.cursor()]
            return {"sender": sender, "email_attachments": attachments}
def writeRanks(ids): with newman_connector() as read_cnx1, newman_connector( ) as read_cnx, newman_connector() as write_cnx: with execute_query(read_cnx1.conn(), stmt) as qry: txid = Tx(read_cnx.conn()).next() print "tx: %s" % txid facts = Fact(write_cnx.conn(), autocommit=False) print "assigning ranks" for mail in qry.cursor(): #print mail[0] #, "email_addr", "rank", ids.get(mail,'0'), txid facts.addFact(mail[0], "email_addr", "rank", ids.get(mail[0], '0'), txid) print "commit" write_cnx.commit()
def getExploded():
    """Explode each email row into one tab-separated (datetime, from,
    recipient) line per recipient, written to ./tmp/exploded.csv.

    Fixes: the output file is now opened with a context manager so it is
    closed even when the query or a write raises; the three identical
    to/cc/bcc loops are collapsed into one (same output, same order).
    """
    with open('./tmp/exploded.csv', 'w') as output:
        with newman_connector() as read_cnx:
            with execute_query(read_cnx.conn(), stmt) as qry:
                for dt, frome, to, cc, bcc in qry.cursor():
                    # each recipient field is a ';'-separated address list
                    for field in (to, cc, bcc):
                        for r in field.split(';'):
                            if r:
                                output.write('\t'.join((dt, frome, r.strip())) + '\n')
def getExploded():
    """Write ./tmp/exploded.csv with one (datetime, from, recipient) TSV
    line per to/cc/bcc address of every queried email.

    Fix: use ``with open(...)`` so the file handle is released on error
    (the original leaked it if the query or a write failed); the three
    copy-pasted recipient loops are merged without changing output order.
    """
    with open('./tmp/exploded.csv', 'w') as output:
        with newman_connector() as read_cnx:
            with execute_query(read_cnx.conn(), stmt) as qry:
                for dt, frome, to, cc, bcc in qry.cursor():
                    # recipients are ';'-separated; skip empty entries
                    for field in (to, cc, bcc):
                        for r in field.split(';'):
                            if r:
                                output.write('\t'.join((dt, frome, r.strip())) + '\n')
def email_scores(*args):
    """Topic scores for one email within a category (default 'all'), as JSON."""
    email_id = unquote(nth(args, 0, ''))
    category = nth(args, 1, 'all')
    if not email_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing email")
    sql = (" select score from xref_email_topic_score "
           " where category_id = %s and email_id = %s "
           " order by idx ")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), sql, category, email_id) as qry:
            scores = [head(r) for r in qry.cursor()]
            tangelo.content_type("application/json")
            return {"scores": scores, "email": email_id, "category": category}
def email_scores(*args):
    """Per-topic scores (ordered by idx) for an email in a category."""
    email_id = unquote(nth(args, 0, ''))
    category = nth(args, 1, 'all')
    if not email_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing email")
    sql = (" select score from xref_email_topic_score "
           " where category_id = %s and email_id = %s "
           " order by idx ")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), sql, category, email_id) as qry:
            out = []
            for r in qry.cursor():
                out.append(head(r))
            tangelo.content_type("application/json")
            return {"scores": out, "email": email_id, "category": category}
def getAttachmentsSender(*args):
    """All emails carrying attachments from the given sender address."""
    sender = urllib.unquote(nth(args, 0, ''))
    if not sender:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")
    tangelo.content_type("application/json")
    sql = (" select id, dir, datetime, from_addr, tos, ccs, bccs, subject, attach, bodysize "
           " from email "
           " where from_addr = %s and attach != '' ")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), sql, sender) as qry:
            rows = []
            for row in qry.cursor():
                # text columns get utf-8 encoded; everything else is stringified
                rows.append([v.encode('utf-8') if isinstance(v, basestring) else str(v)
                             for v in row])
            return {"sender": sender, "email_attachments": rows}
def buildExportable(*args):
    """Bundle every exportable email's directory into a tar.gz under the
    web downloads directory and return its relative path as JSON.

    Rebuilds a staging dir, copies each exportable email's files into it,
    archives the staging dir, then moves the archive into downloads/.
    Relies on external helpers rmrf/mkdir/mv/fmtNow defined elsewhere.
    """
    webroot = cherrypy.config.get("webroot")
    target = getOpt('target')
    # source tree of the analyzed mailbox, e.g. <webroot>/emails/<target>
    base_src = "{}/emails/{}".format(webroot, target)
    tmp_dir = os.path.abspath("{}/../tmp/".format(webroot))
    download_dir = "{}/downloads/".format(webroot)
    # archive name is timestamped so repeated exports don't collide
    tar_gz = "export_{}".format(fmtNow())
    base_dest = os.path.abspath("{}/../tmp/newman_dl".format(webroot))
    # start from a clean staging directory
    if os.path.exists(base_dest):
        rmrf(base_dest)
    if not os.path.exists(download_dir):
        mkdir(download_dir)
    mkdir(base_dest)
    # Get list of paths...
    stmt = (" SELECT id, dir FROM email WHERE exportable='true' ")
    msg = ''          # NOTE(review): msg and paths_to_copy are never used
    paths_to_copy = []
    tangelo.content_type("application/json")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            # copy each exportable email's directory into the staging area
            for email_id, val in qry.cursor():
                src = "{}/{}/".format(base_src, val)
                dest = "{}/{}/".format(base_dest, val)
                shutil.copytree(src, dest)
    # compress dir
    shutil.make_archive("{}/{}".format(tmp_dir, tar_gz), "gztar",
                        root_dir=base_dest)
    # move to web downloads
    mv("{}/{}.tar.gz".format(tmp_dir, tar_gz),
       "{}/{}.tar.gz".format(download_dir, tar_gz))
    return {"file": "downloads/{}.tar.gz".format(tar_gz)}
def buildExportable(*args):
    """Archive all exportable emails into a timestamped tar.gz placed in
    the downloads directory; returns the archive's relative path as JSON.

    Uses external helpers rmrf/mkdir/mv/fmtNow defined elsewhere in the
    project.
    """
    webroot = cherrypy.config.get("webroot")
    target = getOpt('target')
    # mailbox source root for the analyzed target
    base_src = "{}/emails/{}".format(webroot,target)
    tmp_dir = os.path.abspath("{}/../tmp/".format(webroot))
    download_dir = "{}/downloads/".format(webroot)
    # timestamped so successive exports get distinct names
    tar_gz = "export_{}".format(fmtNow())
    base_dest = os.path.abspath("{}/../tmp/newman_dl".format(webroot))
    # reset the staging directory before copying
    if os.path.exists(base_dest):
        rmrf(base_dest)
    if not os.path.exists(download_dir):
        mkdir(download_dir)
    mkdir(base_dest)
    # Get list of paths...
    stmt = (" SELECT id, dir FROM email WHERE exportable='true' ")
    msg = ''          # NOTE(review): msg and paths_to_copy appear unused
    paths_to_copy = []
    tangelo.content_type("application/json")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            # stage every exportable email's directory
            for email_id, val in qry.cursor():
                src = "{}/{}/".format(base_src,val)
                dest = "{}/{}/".format(base_dest, val)
                shutil.copytree(src, dest)
    # compress dir
    shutil.make_archive("{}/{}".format(tmp_dir, tar_gz), "gztar",
                        root_dir=base_dest)
    # move to web downloads
    mv("{}/{}.tar.gz".format(tmp_dir, tar_gz),
       "{}/{}.tar.gz".format(download_dir, tar_gz))
    return { "file" : "downloads/{}.tar.gz".format(tar_gz) }
args= parser.parse_args() with newman_connector() as read_cnx, newman_connector() as write_cnx: txid = Tx(read_cnx.conn()).next() print "tx: %s" % txid print "enrich sent to recipient communications" facts = Fact(write_cnx.conn(), autocommit=False) with execute_nonquery(write_cnx.conn(), stmt_sent_to, txid) as qry: pass write_cnx.commit() txid = Tx(read_cnx.conn()).next() print "tx: %s" % txid print "enrich total sent to " with execute_query(write_cnx.conn(), stmt_total_recipients, txid) as qry: pass write_cnx.commit() txid = Tx(read_cnx.conn()).next() print "tx: %s" % txid print "enrich total received " with execute_query(write_cnx.conn(), stmt_total_received, txid) as qry: pass write_cnx.commit() txid = Tx(read_cnx.conn()).next() print "tx: %s" % txid print "enrich total sent " with execute_query(read_cnx.conn(), stmt_total_sent, txid) as qry:
def getTopics(_id):
    """Formatted topic name/score pairs for the email with id *_id*."""
    with newman_connector() as cnx:
        with execute_query(cnx.conn(), topic_stmt, _id) as qry:
            def _fmt(row):
                return {'name': formatName(nth(row, 0)),
                        'score': formatScore(nth(row, 1))}
            return [_fmt(row) for row in qry.cursor()]
def findLineNum(emailid):
    """Resolve an email id to its source line number."""
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_email_to_line_num, emailid) as qry:
            row = qry.cursor().fetchone()
            return head(row)
spit(f, "\n".join(buffer) + "\n") if __name__ == "__main__": print "loading NER model..." ner = named_entity_extractor('/srv/software/MITIE/MITIE-models/english/ner_model.dat') extract = partial(extract_entities, ner) print "\nTags output by this NER model:", ner.get_possible_ner_tags() c = counter(1) flush_entity = partial(flush_buffer, "tmp/entity_ingest.tsv") with newman_connector() as read_cnx: with execute_query(read_cnx.conn(), stmt) as qry: buffer_entity=[] for email_id, body in qry.cursor(): count = c.next() if count % 1000 == 0: print "processed: %s " % count r = extract(email_id, body) for i, item in enumerate(r): email_id, tag_name, entity, offset = item entity_id = "%s_entity_%s" % (email_id, i) buffer_entity.append("\t".join([entity_id, tag_name.lower(), str(i), entity, email_id, str(offset)])) flush_entity(buffer_entity)
def findEmailId(line_num):
    """Inverse of findLineNum: resolve a source line number to its email id."""
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_line_num_to_email, line_num) as qry:
            first = qry.cursor().fetchone()
            return head(first)
" group by id " " ) as t2 " " on u.rollup_id = t2.id " " set u.total_emails = t2.c ") stmt_populate_xref = (" insert into xref_rollup_entity (rollup_id, entity_id) " " select distinct r.rollup_id, e.subject " " from entity_rollup r join entity e " " on r.type = e.entity_type and r.val = e.value ") if __name__ == "__main__": parser = argparse.ArgumentParser(description='Roll up enities') args = parser.parse_args() with newman_connector() as write_cnx: print "rollup entities" with execute_query(write_cnx.conn(), stmt_rollup_entities) as qry: pass write_cnx.commit() print "entity update email totals" with execute_query(write_cnx.conn(), stmt_update_rollup_counts) as qry: pass write_cnx.commit() print "populate xref rollup to entity" with execute_query(write_cnx.conn(), stmt_populate_xref) as qry: pass write_cnx.commit()
args = parser.parse_args()
# Enrichment driver: runs four statements, each under its own tx id,
# committing after each step.
with newman_connector() as read_cnx, newman_connector() as write_cnx:
    txid = Tx(read_cnx.conn()).next()
    print "tx: %s" % txid
    print "enrich sent to recipient communications"
    # facts is created here but not used below in this visible chunk —
    # presumably consumed by code outside this view; verify before removing
    facts = Fact(write_cnx.conn(), autocommit=False)
    with execute_nonquery(write_cnx.conn(), stmt_sent_to, txid) as qry:
        pass
    write_cnx.commit()
    txid = Tx(read_cnx.conn()).next()
    print "tx: %s" % txid
    print "enrich total sent to "
    # NOTE(review): the remaining steps use execute_query while the first
    # used execute_nonquery — looks inconsistent; confirm intent
    with execute_query(write_cnx.conn(), stmt_total_recipients, txid) as qry:
        pass
    write_cnx.commit()
    txid = Tx(read_cnx.conn()).next()
    print "tx: %s" % txid
    print "enrich total received "
    with execute_query(write_cnx.conn(), stmt_total_received, txid) as qry:
        pass
    write_cnx.commit()
    txid = Tx(read_cnx.conn()).next()
    print "tx: %s" % txid
    print "enrich total sent "
    # NOTE(review): this step runs on read_cnx (not write_cnx) and has no
    # commit in this visible chunk — confirm against the full script
    with execute_query(read_cnx.conn(), stmt_total_sent, txid) as qry:
        pass
" on t1.subject = t2.subject and t1.schema_name = t2.schema_name " " where t1.schema_name = 'email' " " and t1.predicate = 'from' " " and t2.predicate in ('to', 'cc', 'bcc') " " group by t1.obj, t2.obj " " ) as bi_dir " " GROUP BY source, target " " ) as lvn " " group by source, target ") nodes = [] node_map = {} edges = [] with newman_connector() as cnx: with execute_query(cnx.conn(), stmt) as qry: c = counter() for row in qry.cursor(): src, target, weight = row if src not in node_map: node_map[src] = c.next() nodes.append({ 'name': src, 'community': 'n/a', 'idx': node_map[src] }) if target not in node_map: node_map[target] = c.next() nodes.append({
# Links each rollup to the entity rows it covers, matching on type+value.
stmt_populate_xref = (
    " insert into xref_rollup_entity (rollup_id, entity_id) "
    " select distinct r.rollup_id, e.subject "
    " from entity_rollup r join entity e "
    " on r.type = e.entity_type and r.val = e.value "
)

if __name__ == "__main__":
    # Entity rollup driver: three statements, each committed separately so
    # a later failure does not roll back the earlier steps.
    parser = argparse.ArgumentParser(description='Roll up enities')
    args = parser.parse_args()
    with newman_connector() as write_cnx:
        print "rollup entities"
        with execute_query(write_cnx.conn(), stmt_rollup_entities) as qry:
            pass
        write_cnx.commit()
        print "entity update email totals"
        with execute_query(write_cnx.conn(), stmt_update_rollup_counts) as qry:
            pass
        write_cnx.commit()
        print "populate xref rollup to entity"
        with execute_query(write_cnx.conn(), stmt_populate_xref) as qry:
            pass
        write_cnx.commit()
def current(self):
    """Return the highest transaction id recorded in the tx table."""
    with execute_query(self.conn, 'select max(tx) from tx') as qry:
        row = qry.cursor().fetchone()
        return next(iter(row))