def get_attachments_json(msg):
    """Describe every attachment of ``msg`` as a JSON-friendly dict.

    ``msg`` must expose an ``attachments`` iterable whose items provide the
    ``data``, ``longFilename`` and ``shortFilename`` attributes.

    Returns a list (empty when ``msg.attachments`` is falsy) of dicts with
    the keys "filename", "guid", "extension", "filesize", "contents64" and
    "content_type".
    """
    if not msg.attachments:
        return []

    # counter() is defined elsewhere in this module; it is only advanced
    # when an attachment carries no usable file name.
    attach_count = counter()
    attach = []
    for attachment in msg.attachments:
        data = attachment.data
        # An attachment may have no payload at all. b64encode(None) raises,
        # so fall back to an empty string, mirroring the "filesize" fallback.
        b64 = base64.b64encode(data) if data is not None else ''

        # Prefer the long name, then the short name, then a generated one.
        fileName = attachment.longFilename or attachment.shortFilename
        if not fileName:
            # next() works on Python 2 and 3 iterators alike.
            fileName = "attach_{}".format(next(attach_count))

        _, extension = os.path.splitext(fileName.lower())

        # TODO replace this call to "guess" with the mime type stored in the
        # msg attachment
        mime_type = mimetypes.guess_type(fileName)[0]

        attach.append({
            "filename": fileName,
            "guid": str(uuid.uuid1()),
            "extension": extension,
            "filesize": len(data) if data else 0,
            "contents64": b64,
            "content_type": mime_type,
        })
    return attach
def get_attachments_json(msg):
    """Describe every attachment of ``msg`` as a JSON-friendly dict.

    ``msg`` must expose an ``attachments`` iterable whose items provide the
    ``data``, ``longFilename`` and ``shortFilename`` attributes.

    Returns a list (empty when ``msg.attachments`` is falsy) of dicts with
    the keys "filename", "guid", "extension", "filesize", "contents64" and
    "content_type".
    """
    if not msg.attachments:
        return []

    # counter() is defined elsewhere in this module; it is only advanced
    # when an attachment carries no usable file name.
    attach_count = counter()
    attach = []
    for attachment in msg.attachments:
        data = attachment.data
        # An attachment may have no payload at all. b64encode(None) raises,
        # so fall back to an empty string, mirroring the "filesize" fallback.
        b64 = base64.b64encode(data) if data is not None else ''

        # Prefer the long name, then the short name, then a generated one.
        fileName = attachment.longFilename or attachment.shortFilename
        if not fileName:
            # next() works on Python 2 and 3 iterators alike.
            fileName = "attach_{}".format(next(attach_count))

        _, extension = os.path.splitext(fileName.lower())

        # TODO replace this call to "guess" with the mime type stored in the
        # msg attachment
        mime_type = mimetypes.guess_type(fileName)[0]

        attach.append({
            "filename": fileName,
            "guid": str(uuid.uuid1()),
            "extension": extension,
            "filesize": len(data) if data else 0,
            "contents64": b64,
            "content_type": mime_type,
        })
    return attach
def extract(email_id, message, categories, preserve_attachments=True):
    """Collect the body text and attachments of ``message`` and build its row.

    Every MIME part yielded by ``message.walk()`` is inspected:

    * ``text/plain`` parts without a file name are decoded to unicode and
      appended to the body, separated by a "Next Part" banner.
    * ``text/html`` parts without a file name are decoded via
      ``decode_body``. The body type is "html" only when no text preceded
      the html part; otherwise the html is appended as raw text.
    * Any non-multipart part carrying a ``Content-Disposition`` header is
      recorded as an attachment, except delivery-status reports and the
      'rtf-body.rtf' placeholder.

    Args:
        email_id: identifier passed through to ``createRow`` and used in the
            failure message.
        message: object offering the ``email.message.Message`` API used here
            (``walk``, ``get_payload``, ...).
        categories: passed through unchanged to ``createRow``.
        preserve_attachments: when false, parts that have a
            ``Content-Disposition`` but no file name are skipped instead of
            being stored under a generated "attach_N" name.

    Returns:
        Whatever ``createRow`` returns.

    Raises:
        Any exception raised by ``createRow``; it is printed first and then
        re-raised with its original traceback.
    """
    part_separator = u"\n=============================Next Part==============================\n"
    attach = []
    msg = u''
    body_type = "text"
    attach_count = counter()

    for part in message.walk():
        content_type = part.get_content_type()
        is_unnamed = part.get_filename(None) is None

        # We only want plain text which is not an attachment.
        # Email body may be provided in text, text + html separate parts or
        # rarely as html only.
        if content_type == 'text/plain' and is_unnamed:
            body_type = "text"
            if msg:
                msg += part_separator
            # Transfer-encoding tokens are case-insensitive (RFC 2045), so
            # normalise before comparing.
            encoding = part.get_all('Content-Transfer-Encoding', [''])[0]
            decode = encoding.strip().lower() in ('base64', 'quoted-printable')
            charset = part.get_content_charset()
            text = part.get_payload(decode=decode)
            if not charset:
                # No declared charset: use the detector's guess. The guess
                # may itself be None, which is what was passed before.
                charset = chardet.detect(text).get('encoding')
            msg += str_to_unicode(text, encoding=charset)
        # Handle html only body
        elif content_type == 'text/html' and is_unnamed:
            html = decode_body(part)
            body_type = "html"
            if msg:
                # If there is already text in the msg then just treat the
                # html as raw text
                msg += part_separator
                body_type = "text"
            msg += html

        # Everything below deals with attachments only.
        if content_type == 'message/delivery-status':
            continue
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue

        fileName = part.get_filename()
        if not fileName and not preserve_attachments:
            continue
        if fileName:
            fileName = convert_encoded(fileName)
        else:
            fileName = "attach_{}".format(next(attach_count))

        # TODO do we need to handle fileName == 'rtf-body.rtf'?
        if fileName == 'rtf-body.rtf':
            continue

        _, extension = os.path.splitext(fileName.lower())
        filename_guid = str(uuid.uuid1())

        # get_payload(decode=True) may return None; store an empty payload
        # in that case instead of failing.
        bstr = part.get_payload(decode=True)
        b64 = base64.b64encode(bstr) if bstr else ''

        attach.append({
            "filename": fileName,
            "guid": filename_guid,
            "extension": extension,
            "filesize": len(bstr) if bstr else 0,
            "contents64": b64,
            "content_type": content_type,
        })

    try:
        row = createRow(email_id, message, attach, msg, body_type, categories)
    except Exception as e:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Failed to process message: {} Exception: {}".format(email_id, e))
        print(traceback.format_exc())
        # Bare raise keeps the original traceback.
        raise
    return row
def extract(email_id, message, categories):
    """Collect the plain-text body and attachments of ``message`` into a row.

    ``text/plain`` parts without a ``Content-Disposition`` header are
    concatenated into the body, separated by a "Next Part" banner. Every
    non-multipart part that has a ``Content-Disposition`` header is stored as
    an attachment, except delivery-status reports and 'rtf-body.rtf'.

    Args:
        email_id: identifier passed through to ``createRow``.
        message: object offering the ``email.message.Message`` API used here.
        categories: passed through unchanged to ``createRow``.

    Returns:
        Whatever ``createRow`` returns.
    """
    part_separator = "\n=============================Next Part==============================\n"
    attach = []
    msg = ""
    attach_count = counter()

    for part in message.walk():
        # We only want plain text which is not an attachment
        if part.get_content_type() == 'text/plain' and not part.get('Content-Disposition'):
            if msg:
                msg += part_separator
            # Transfer-encoding tokens are case-insensitive (RFC 2045), so
            # normalise before comparing.
            encoding = part.get_all('Content-Transfer-Encoding', [''])[0]
            decode = encoding.strip().lower() in ('base64', 'quoted-printable')
            text = part.get_payload(decode=decode)
            if text.strip():
                msg += text
            # TODO the part's charset (part.get_param('charset')) is not
            # applied yet; base64 content with a charset still needs decoding.
            # TODO this may happen multiple times; each occurrence probably
            # has to be added back to the text msg, or at least kept as an
            # attachment so it is available for view and indexing later.
            # TODO the HTML portion is currently missing; it often shows up
            # as Content-Type 'text/html' with similar text.

        if part.get_content_type() == 'message/delivery-status':
            continue
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue

        fileName = part.get_filename()
        if fileName:
            fileName = fileName.lower()
        else:
            fileName = "attach_{}".format(next(attach_count))

        # TODO do we need to handle fileName == 'rtf-body.rtf'?
        if fileName == 'rtf-body.rtf':
            continue

        fileName = clean_string(fileName, [
            EXPR_OPTS['fix_utf8'],
            EXPR_OPTS['fix_forwardslash'],
            (r' ', '_'),
            (r'&', '_'),
        ])
        _, extension = os.path.splitext(fileName.lower())
        filename_guid = str(uuid.uuid1())

        # get_payload(decode=True) may return None; store an empty payload
        # in that case instead of failing in b64encode() / len().
        bstr = part.get_payload(decode=True)
        b64 = base64.b64encode(bstr) if bstr else ''

        attach.append({
            "filename": fileName,
            "guid": filename_guid,
            "extension": extension,
            "filesize": len(bstr) if bstr else 0,
            "contents64": b64,
        })

    msg = clean_string(msg, [EXPR_OPTS['fix_utf8']])
    row = createRow(email_id, message, attach, msg, categories)
    return row
def extract(email_id, message, categories):
    """Collect the plain-text body and attachments of ``message`` into a row.

    ``text/plain`` parts without a ``Content-Disposition`` header are
    concatenated into the body, separated by a "Next Part" banner. Every
    non-multipart part that has a ``Content-Disposition`` header is stored as
    an attachment, except delivery-status reports and 'rtf-body.rtf'.

    Args:
        email_id: identifier passed through to ``createRow``.
        message: object offering the ``email.message.Message`` API used here.
        categories: passed through unchanged to ``createRow``.

    Returns:
        Whatever ``createRow`` returns.
    """
    part_separator = "\n=============================Next Part==============================\n"
    attach = []
    msg = ""
    attach_count = counter()

    for part in message.walk():
        # We only want plain text which is not an attachment
        if part.get_content_type() == 'text/plain' and not part.get('Content-Disposition'):
            if msg:
                msg += part_separator
            # Transfer-encoding tokens are case-insensitive (RFC 2045), so
            # normalise before comparing.
            encoding = part.get_all('Content-Transfer-Encoding', [''])[0]
            decode = encoding.strip().lower() in ('base64', 'quoted-printable')
            text = part.get_payload(decode=decode)
            if text.strip():
                msg += text
            # TODO the part's charset (part.get_param('charset')) is not
            # applied yet; base64 content with a charset still needs decoding.
            # TODO this may happen multiple times; each occurrence probably
            # has to be added back to the text msg, or at least kept as an
            # attachment so it is available for view and indexing later.
            # TODO the HTML portion is currently missing; it often shows up
            # as Content-Type 'text/html' with similar text.

        if part.get_content_type() == 'message/delivery-status':
            continue
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue

        fileName = part.get_filename()
        if fileName:
            fileName = fileName.lower()
        else:
            fileName = "attach_{}".format(next(attach_count))

        # TODO do we need to handle fileName == 'rtf-body.rtf'?
        if fileName == 'rtf-body.rtf':
            continue

        fileName = clean_string(fileName, [
            EXPR_OPTS['fix_utf8'],
            EXPR_OPTS['fix_forwardslash'],
            (r' ', '_'),
            (r'&', '_'),
        ])
        _, extension = os.path.splitext(fileName.lower())
        filename_guid = str(uuid.uuid1())

        # get_payload(decode=True) may return None; store an empty payload
        # in that case instead of failing in b64encode() / len().
        bstr = part.get_payload(decode=True)
        b64 = base64.b64encode(bstr) if bstr else ''

        attach.append({
            "filename": fileName,
            "guid": filename_guid,
            "extension": extension,
            "filesize": len(bstr) if bstr else 0,
            "contents64": b64,
        })

    msg = clean_string(msg, [EXPR_OPTS['fix_utf8']])
    row = createRow(email_id, message, attach, msg, categories)
    return row
def extract(email_id, message, categories, preserve_attachments=True):
    """Collect the body text and attachments of ``message`` and build its row.

    Every MIME part yielded by ``message.walk()`` is inspected:

    * ``text/plain`` parts without a file name are decoded to unicode and
      appended to the body, separated by a "Next Part" banner.
    * ``text/html`` parts without a file name are decoded via
      ``decode_body``. The body type is "html" only when no text preceded
      the html part; otherwise the html is appended as raw text.
    * Any non-multipart part carrying a ``Content-Disposition`` header is
      recorded as an attachment, except delivery-status reports and the
      'rtf-body.rtf' placeholder.

    Args:
        email_id: identifier passed through to ``createRow`` and used in the
            failure message.
        message: object offering the ``email.message.Message`` API used here
            (``walk``, ``get_payload``, ...).
        categories: passed through unchanged to ``createRow``.
        preserve_attachments: when false, parts that have a
            ``Content-Disposition`` but no file name are skipped instead of
            being stored under a generated "attach_N" name.

    Returns:
        Whatever ``createRow`` returns.

    Raises:
        Any exception raised by ``createRow``; it is printed first and then
        re-raised with its original traceback.
    """
    part_separator = u"\n=============================Next Part==============================\n"
    attach = []
    msg = u''
    body_type = "text"
    attach_count = counter()

    for part in message.walk():
        content_type = part.get_content_type()
        is_unnamed = part.get_filename(None) is None

        # We only want plain text which is not an attachment.
        # Email body may be provided in text, text + html separate parts or
        # rarely as html only.
        if content_type == 'text/plain' and is_unnamed:
            body_type = "text"
            if msg:
                msg += part_separator
            # Transfer-encoding tokens are case-insensitive (RFC 2045), so
            # normalise before comparing.
            encoding = part.get_all('Content-Transfer-Encoding', [''])[0]
            decode = encoding.strip().lower() in ('base64', 'quoted-printable')
            charset = part.get_content_charset()
            text = part.get_payload(decode=decode)
            if not charset:
                # No declared charset: use the detector's guess. The guess
                # may itself be None, which is what was passed before.
                charset = chardet.detect(text).get('encoding')
            msg += str_to_unicode(text, encoding=charset)
        # Handle html only body
        elif content_type == 'text/html' and is_unnamed:
            html = decode_body(part)
            body_type = "html"
            if msg:
                # If there is already text in the msg then just treat the
                # html as raw text
                msg += part_separator
                body_type = "text"
            msg += html

        # Everything below deals with attachments only.
        if content_type == 'message/delivery-status':
            continue
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue

        fileName = part.get_filename()
        if not fileName and not preserve_attachments:
            continue
        if fileName:
            fileName = convert_encoded(fileName)
        else:
            fileName = "attach_{}".format(next(attach_count))

        # TODO do we need to handle fileName == 'rtf-body.rtf'?
        if fileName == 'rtf-body.rtf':
            continue

        _, extension = os.path.splitext(fileName.lower())
        filename_guid = str(uuid.uuid1())

        # get_payload(decode=True) may return None; store an empty payload
        # in that case instead of failing.
        bstr = part.get_payload(decode=True)
        b64 = base64.b64encode(bstr) if bstr else ''

        attach.append({
            "filename": fileName,
            "guid": filename_guid,
            "extension": extension,
            "filesize": len(bstr) if bstr else 0,
            "contents64": b64,
            "content_type": content_type,
        })

    try:
        row = createRow(email_id, message, attach, msg, body_type, categories)
    except Exception as e:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Failed to process message: {} Exception: {}".format(email_id, e))
        print(traceback.format_exc())
        # Bare raise keeps the original traceback.
        raise
    return row