def __init__(self, directory, filename, extension, limit_megabytes=10):
    """Initialize output-location state and a byte size limit.

    Args:
        directory: target directory for output files.
        filename: base name used for output files.
        extension: extension appended to output files.
        limit_megabytes: size threshold in megabytes (default 10);
            stored converted to bytes in self.limit_bytes.
    """
    self.directory = directory
    self.filename = filename
    self.extension = extension
    # project-local counter(); presumably numbers successive files -- TODO confirm
    self.counter = counter(0)
    # no file selected yet
    self.current_file = ''
    # convert megabytes -> bytes once, up front
    self.limit_bytes = limit_megabytes*1024*1024
def __init__(self, directory, filename, extension, limit_megabytes=10):
    """Record the output location/naming state and the size cap in bytes."""
    # size cap is supplied in megabytes; keep it as bytes internally
    self.limit_bytes = limit_megabytes * (1024 ** 2)
    self.current_file = ''
    self.counter = counter(0)
    self.extension = extension
    self.filename = filename
    self.directory = directory
def extract(email_id, buff_mail, out_dir, categories, target_email):
    """Extract body text and attachments from one raw email.

    Writes the raw .eml, every attachment, and the concatenated plain-text
    body under <out_dir>/emails/<email_id>/, then builds a row via the
    project-level createRow helper.

    Args:
        email_id: unique id used for the per-email directory and file names.
        buff_mail: raw RFC822 message as a string.
        out_dir: root output directory.
        categories: passed through to createRow.
        target_email: passed through to createRow.

    Returns:
        Whatever createRow returns for this email.
    """
    _dir = "{}/emails/{}".format(out_dir, email_id)
    mkdirp(_dir)
    #write raw email to new dir
    spit("{}/{}.eml".format(_dir, email_id), buff_mail)
    mail = email.message_from_string(buff_mail)
    attach = []
    msg = ""
    attach_count = counter()
    for part in mail.walk():
        # accumulate every text/plain payload into the message body
        if part.get_content_type() == 'text/plain':
            msg = msg + "\n" + part.get_payload()
        # skip bounce reports and multipart containers; only parts carrying
        # a Content-Disposition header are treated as attachments
        if part.get_content_type() == 'message/delivery-status':
            continue
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue
        fileName = part.get_filename()
        # unnamed attachments get a generated, numbered name
        fileName = fileName if fileName else "Attach_{}".format(
            attach_count.next())
        if fileName == 'rtf-body.rtf':
            continue
        # sanitize the name so it is safe as a path component
        fileName = clean_string(fileName, [
            EXPR_OPTS['fix_utf8'],
            EXPR_OPTS['fix_forwardslash'],
            (r' ', '_'), (r'&', '_')])
        attach.append(fileName)
        filePath = "{}/{}".format(_dir, fileName)
        #save attachment; with-statement closes the handle even if write raises
        # (original used bare open/close and leaked the handle on error)
        with open(filePath, 'wb') as fp:
            fp.write(part.get_payload(decode=True))
    msg = clean_string(msg, [EXPR_OPTS['fix_utf8']])
    spit("{}/{}.txt".format(_dir, email_id), msg)
    row = createRow(email_id, "emails/{}".format(email_id), target_email,
                    mail, categories, attach, msg)
    return row
def extract(email_id, buff_mail, out_dir, categories, target_email):
    """Dump one raw email (raw .eml, attachments, text body) to disk and return its row."""
    dest = "{}/emails/{}".format(out_dir, email_id)
    mkdirp(dest)
    # persist the raw message first
    spit("{}/{}.eml".format(dest, email_id), buff_mail)
    mail = email.message_from_string(buff_mail)
    attachments = []
    body = ""
    unnamed = counter()
    for part in mail.walk():
        ctype = part.get_content_type()
        if ctype == 'text/plain':
            body += "\n" + part.get_payload()
        # only disposition-carrying, non-container parts count as attachments
        if ctype == 'message/delivery-status':
            continue
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue
        fname = part.get_filename() or "Attach_{}".format(unnamed.next())
        if fname == 'rtf-body.rtf':
            continue
        # scrub characters that would break a filesystem path
        fname = clean_string(fname, [
            EXPR_OPTS['fix_utf8'],
            EXPR_OPTS['fix_forwardslash'],
            (r' ', '_'),
            (r'&', '_')])
        attachments.append(fname)
        fpath = "{}/{}".format(dest, fname)
        #save attachment
        fp = open(fpath, 'wb')
        fp.write(part.get_payload(decode=True))
        fp.close()
    body = clean_string(body, [EXPR_OPTS['fix_utf8']])
    spit("{}/{}.txt".format(dest, email_id), body)
    return createRow(email_id, "emails/{}".format(email_id), target_email,
                     mail, categories, attachments, body)
" where t1.schema_name = 'email' " " and t1.predicate = 'from' " " and t2.predicate in ('to', 'cc', 'bcc') " " group by t1.obj, t2.obj " " ) as bi_dir " " GROUP BY source, target " " ) as lvn " " group by source, target ") nodes = [] node_map = {} edges = [] with newman_connector() as cnx: with execute_query(cnx.conn(), stmt) as qry: c = counter() for row in qry.cursor(): src, target, weight = row if src not in node_map: node_map[src] = c.next() nodes.append({ 'name': src, 'community': 'n/a', 'idx': node_map[src] }) if target not in node_map: node_map[target] = c.next() nodes.append({ 'name': target,
def download(srv, target_email, outdir, limit, logfile):
    """Download messages from a Gmail IMAP mailbox into outdir.

    For each message: writes the raw .eml, any attachments, and a plain-text
    body under <outdir>/emails/<uid>/, then appends a row to output.csv.
    Per-message failures are logged to logfile and skipped; a failed
    SEARCH/FETCH handshake raises.

    Args:
        srv: connected imaplib-style server (select/search/fetch methods).
        target_email: account address, passed through to createRow.
        outdir: root output directory.
        limit: if > 0, only the most recent `limit` message ids are fetched.
        logfile: path handed to spit() for progress/error lines.

    Raises:
        Exception: when the IMAP SEARCH response is not 'OK'.
    """
    srv.select("[Gmail]/All Mail", True)
    #resp, data = srv.uid('SEARCH', None, 'ALL')
    resp, data = srv.search(None, 'ALL')
    if resp != 'OK':
        err_msg = "Error searching: %s %s" % (resp, data)
        spit(logfile, "[Error] {}\n".format(err_msg))
        raise Exception(err_msg)
    msgids = data[0].split()
    if limit > 0:
        msgids = msgids[-limit:]
    attach_count = counter()
    c = counter()
    total = len(msgids)
    # BUG FIX: i must exist before the loop -- the except handler logs it,
    # and getUIDForMessage/mkdir can raise before c.next() ever assigns it.
    i = 0
    for msgid in msgids:
        try:
            uid = getUIDForMessage(srv, msgid)
            fldr = "emails/{}".format(uid)
            mkdir("{}/{}".format(outdir, fldr))
            i = c.next()
            if i % 200 == 0:
                spit(logfile,
                     "[Downloading] Downloaded: {}/{}\n".format(i, total))
            resp, msgParts = srv.fetch(msgid, '(RFC822)')
            if resp != 'OK':
                err_msg = "Bad response: %s %s" % (resp, msgParts)
                spit(logfile, "[Error] {}\n".format(err_msg))
                raise Exception(err_msg)
            emailBody = msgParts[0][1]
            spit("{}/{}/{}.eml".format(outdir, fldr, uid), emailBody)
            mail = email.message_from_string(emailBody)
            attach = []
            msg = ""
            for part in mail.walk():
                if part.get_content_type() == 'text/plain':
                    msg = msg + "\n" + part.get_payload()
                if part.get_content_maintype() == 'multipart':
                    continue
                if part.get('Content-Disposition') is None:
                    continue
                fileName = part.get_filename()
                #escape file name; unnamed parts get a generated name
                fileName = fileName if fileName else "Attach_{}".format(
                    attach_count.next())
                fileName = fileName.replace('/', '_')
                attach.append(fileName)
                filePath = "{}/{}/{}".format(outdir, fldr, fileName)
                # with-statement closes the handle even if write raises
                with open(filePath, 'wb') as fp:
                    fp.write(part.get_payload(decode=True))
            # strip non-ASCII before writing the text body
            msg = re.sub(r'[^\x00-\x7F]', ' ', msg)
            spit("{}/{}/{}.txt".format(outdir, fldr, uid), msg)
            row = createRow(uid, fldr, target_email, mail, attach, msg)
            spit("{}/output.csv".format(outdir), row + "\n")
        except Exception as e:
            # best-effort: log and move on to the next message
            spit(logfile,
                 "[Downloading] [Exception]: line {}, msgid {}, except {}\n"
                 .format(i, msgid, str(e)))
            continue
def download(srv, target_email, outdir, limit, logfile):
    """Pull each message in [Gmail]/All Mail into its own folder under outdir."""
    srv.select("[Gmail]/All Mail", True)
    #resp, data = srv.uid('SEARCH', None, 'ALL')
    resp, data = srv.search(None, 'ALL')
    if resp != 'OK':
        err_msg = "Error searching: %s %s" % (resp, data)
        spit(logfile, "[Error] {}\n".format(err_msg))
        raise Exception(err_msg)
    ids = data[0].split()
    if limit > 0:
        ids = ids[-limit:]
    auto_names = counter()
    seq = counter()
    count = len(ids)
    for msgid in ids:
        try:
            uid = getUIDForMessage(srv, msgid)
            folder = "emails/{}".format(uid)
            mkdir("{}/{}".format(outdir, folder))
            idx = seq.next()
            # progress heartbeat every 200 messages
            if idx % 200 == 0:
                spit(logfile, "[Downloading] Downloaded: {}/{}\n".format(idx, count))
            resp, chunks = srv.fetch(msgid, '(RFC822)')
            if resp != 'OK':
                err_msg = "Bad response: %s %s" % (resp, chunks)
                spit(logfile, "[Error] {}\n".format(err_msg))
                raise Exception(err_msg)
            raw = chunks[0][1]
            spit("{}/{}/{}.eml".format(outdir, folder, uid), raw)
            mail = email.message_from_string(raw)
            names = []
            text = ""
            for part in mail.walk():
                if part.get_content_type() == 'text/plain':
                    text = text + "\n" + part.get_payload()
                if part.get_content_maintype() == 'multipart':
                    continue
                if part.get('Content-Disposition') is None:
                    continue
                #escape file name
                name = part.get_filename() or "Attach_{}".format(auto_names.next())
                name = name.replace('/', '_')
                names.append(name)
                out = open("{}/{}/{}".format(outdir, folder, name), 'wb')
                out.write(part.get_payload(decode=True))
                out.close()
            text = re.sub(r'[^\x00-\x7F]', ' ', text)
            spit("{}/{}/{}.txt".format(outdir, folder, uid), text)
            spit("{}/output.csv".format(outdir),
                 createRow(uid, folder, target_email, mail, names, text) + "\n")
        except Exception as e:
            spit(logfile,
                 "[Downloading] [Exception]: line {}, msgid {}, except {}\n".format(
                     idx, msgid, str(e)))
            continue
" and t1.predicate = 'from' " " and t2.predicate in ('to', 'cc', 'bcc') " " group by t1.obj, t2.obj " " ) as bi_dir " " GROUP BY source, target " " ) as lvn " " group by source, target " ) nodes = [] node_map = {} edges = [] with newman_connector() as cnx: with execute_query(cnx.conn(), stmt) as qry: c = counter() for row in qry.cursor(): src, target, weight = row if src not in node_map: node_map[src] = c.next() nodes.append({'name': src, 'community': 'n/a', 'idx': node_map[src] }) if target not in node_map: node_map[target] = c.next() nodes.append({'name': target, 'community': 'n/a', 'idx': node_map[target] })