def extract(email_id, buff_mail, out_dir, categories, target_email):
    """Unpack one raw email on disk and build its output row.

    Side effects, all under ``<out_dir>/emails/<email_id>/``:

    * ``<email_id>.eml`` -- the raw message exactly as passed in.
    * one file per attachment part, named after its cleaned file name.
    * ``<email_id>.txt`` -- the concatenated ``text/plain`` payloads.

    Args:
        email_id: Identifier used for the directory and file names.
        buff_mail: Raw email source as a string.
        out_dir: Root output directory.
        categories: Passed through unchanged to ``createRow``.
        target_email: Passed through unchanged to ``createRow``.

    Returns:
        Whatever ``createRow`` returns for this message.
    """
    _dir = "{}/emails/{}".format(out_dir, email_id)
    mkdirp(_dir)
    # write raw email to new dir
    spit("{}/{}.eml".format(_dir, email_id), buff_mail)
    mail = email.message_from_string(buff_mail)
    attach = []
    msg = ""
    attach_count = counter()

    for part in mail.walk():
        content_type = part.get_content_type()
        # Plain-text parts feed the body text; they still fall through to the
        # attachment checks below, so a text attachment is saved as well.
        if content_type == 'text/plain':
            msg = msg + "\n" + part.get_payload()
        if content_type == 'message/delivery-status':
            continue
        if part.get_content_maintype() == 'multipart':
            continue
        # Only parts carrying a Content-Disposition header are treated as
        # attachments.
        if part.get('Content-Disposition') is None:
            continue

        # The counter is only advanced for parts without a file name.
        # NOTE(review): assumes counter() returns an iterator -- confirm.
        fileName = part.get_filename() or "Attach_{}".format(next(attach_count))
        if fileName == 'rtf-body.rtf':
            continue
        fileName = clean_string(fileName, [
            EXPR_OPTS['fix_utf8'],
            EXPR_OPTS['fix_forwardslash'],
            (r' ', '_'),
            (r'&', '_'),
        ])

        payload = part.get_payload(decode=True)
        if payload is None:
            # decode=True yields None for container parts (for example an
            # attached message/rfc822); there are no bytes to save, and
            # writing None would raise and abort the whole email.
            continue

        attach.append(fileName)
        filePath = "{}/{}".format(_dir, fileName)
        # save attachment; the context manager closes the file even if the
        # write fails
        with open(filePath, 'wb') as fp:
            fp.write(payload)

    msg = clean_string(msg, [EXPR_OPTS['fix_utf8']])
    spit("{}/{}.txt".format(_dir, email_id), msg)
    row = createRow(email_id, "emails/{}".format(email_id), target_email,
                    mail, categories, attach, msg)
    return row
def extract(email_id, buff_mail, out_dir, categories, target_email):
    """Write an email and its attachments to disk, then return its row.

    Everything is written beneath ``<out_dir>/emails/<email_id>/``: the raw
    message as ``<email_id>.eml``, each attachment under its cleaned file
    name, and the joined ``text/plain`` payloads as ``<email_id>.txt``.

    Args:
        email_id: Identifier used for the directory and file names.
        buff_mail: Raw email source as a string.
        out_dir: Root output directory.
        categories: Forwarded to ``createRow`` as-is.
        target_email: Forwarded to ``createRow`` as-is.

    Returns:
        The value produced by ``createRow`` for this message.
    """
    _dir = "{}/emails/{}".format(out_dir, email_id)
    mkdirp(_dir)

    # write raw email to new dir
    spit("{}/{}.eml".format(_dir, email_id), buff_mail)

    mail = email.message_from_string(buff_mail)
    attach = []
    msg = ""
    attach_count = counter()
    name_fixes = [
        EXPR_OPTS['fix_utf8'],
        EXPR_OPTS['fix_forwardslash'],
        (r' ', '_'),
        (r'&', '_'),
    ]

    for part in mail.walk():
        if part.get_content_type() == 'text/plain':
            # Body text is accumulated here; the part is still checked as a
            # possible attachment below.
            msg = msg + "\n" + part.get_payload()

        if part.get_content_type() == 'message/delivery-status':
            continue
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            # No disposition header: not treated as an attachment.
            continue

        fileName = part.get_filename()
        if not fileName:
            # Unnamed attachments get a sequential placeholder name.
            # NOTE(review): assumes counter() returns an iterator -- confirm.
            fileName = "Attach_{}".format(next(attach_count))
        if fileName == 'rtf-body.rtf':
            continue
        fileName = clean_string(fileName, name_fixes)

        data = part.get_payload(decode=True)
        if data is None:
            # get_payload(decode=True) returns None for container parts such
            # as an attached message/rfc822. Skip instead of crashing on
            # write(None) and losing the entire email.
            continue

        attach.append(fileName)
        filePath = "{}/{}".format(_dir, fileName)
        # save attachment; "with" guarantees the handle is closed on error
        with open(filePath, 'wb') as fp:
            fp.write(data)

    msg = clean_string(msg, [EXPR_OPTS['fix_utf8']])
    spit("{}/{}.txt".format(_dir, email_id), msg)

    row = createRow(email_id, "emails/{}".format(email_id), target_email,
                    mail, categories, attach, msg)
    return row
cat 2006.txt | ./pst/normalize.py [email protected] demail/emails/[email protected] -a --start 0 --limit 1000 ''' parser = argparse.ArgumentParser( description=" ... ", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=desc) parser.add_argument("-a","--header", action='store_true', help="add header to output") parser.add_argument("-s","--start", type=int, default=0, help="start at line #") parser.add_argument("-l", "--limit", type=int, default=0, help="end at line #") parser.add_argument("target_email", help="Target Email") parser.add_argument("out_dir", help="Output Directory") parser.add_argument("infile", nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="Input File") args = parser.parse_args() outfile = "{}/output.csv".format(args.out_dir) mkdirp("{}/emails".format(args.out_dir)) if args.header: spit(outfile, email_extract.headerrow() + "\n") for i, line in enumerate(skip(args.infile, at_start=args.start)): if ((not args.limit == 0) and (i >= args.limit)): break; try: fp = line.strip() guid = email_extract.md5(fp) category = email_extract.categoryList(fp) buff = slurp(fp) row = email_extract.extract(guid, buff, args.out_dir, category, args.target_email) spit(outfile, row + "\n") except Exception as e:
help="start at line #") parser.add_argument("-l", "--limit", type=int, default=0, help="end at line #") parser.add_argument("target_email", help="Target Email") parser.add_argument("out_dir", help="Output Directory") parser.add_argument("infile", nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="Input File") args = parser.parse_args() outfile = "{}/output.csv".format(args.out_dir) mkdirp("{}/emails".format(args.out_dir)) if args.header: spit(outfile, email_extract.headerrow() + "\n") for i, line in enumerate(skip(args.infile, at_start=args.start)): if ((not args.limit == 0) and (i >= args.limit)): break try: fp = line.strip() guid = email_extract.md5(fp) category = email_extract.categoryList(fp) buff = slurp(fp) row = email_extract.extract(guid, buff, args.out_dir, category, args.target_email) spit(outfile, row + "\n")