예제 #1
0
def get_attachments_json(msg):
    """Describe every attachment of ``msg`` as a JSON-serialisable dict.

    :param msg: object exposing an ``attachments`` iterable; each attachment
        is read through its ``data``, ``longFilename`` and ``shortFilename``
        attributes.
    :return: list with one dict per attachment holding the keys
        ``filename``, ``guid``, ``extension``, ``filesize``, ``contents64``
        and ``content_type``; an empty list when ``msg`` has no attachments.
    """
    if not msg.attachments:
        return []

    attach_count = counter()
    attach = []

    for attachment in msg.attachments:
        data = attachment.data
        # An attachment without a payload must not abort the whole message:
        # encode only when there is data, mirroring the "filesize" guard.
        b64 = base64.b64encode(data) if data else ''

        # Prefer the long name, fall back to the short one, and finally to a
        # generated placeholder so every entry has a usable filename.
        fileName = attachment.longFilename or attachment.shortFilename
        if not fileName:
            # Builtin next() instead of the Python-2-only .next() method.
            fileName = "attach_{}".format(next(attach_count))

        _, extension = os.path.splitext(fileName.lower())

        # TODO replace this call to "guess" with the mime type stored in the
        # msg attachment. guess_type() yields None for unknown extensions.
        mime_type = mimetypes.guess_type(fileName)[0]

        attach.append({
            "filename": fileName,
            "guid": str(uuid.uuid1()),
            "extension": extension,
            "filesize": len(data) if data else 0,
            "contents64": b64,
            "content_type": mime_type,
        })
    return attach
예제 #2
0
def get_attachments_json(msg):
    """Describe every attachment of ``msg`` as a JSON-serialisable dict.

    :param msg: object exposing an ``attachments`` iterable; each attachment
        is read through its ``data``, ``longFilename`` and ``shortFilename``
        attributes.
    :return: list with one dict per attachment holding the keys
        ``filename``, ``guid``, ``extension``, ``filesize``, ``contents64``
        and ``content_type``; an empty list when ``msg`` has no attachments.
    """
    if not msg.attachments:
        return []

    attach_count = counter()
    attach = []

    for attachment in msg.attachments:
        payload = attachment.data
        # b64encode() raises on None, so only encode when a payload exists;
        # this keeps "contents64" in step with the guarded "filesize" below.
        if payload:
            b64 = base64.b64encode(payload)
            filesize = len(payload)
        else:
            b64 = ''
            filesize = 0

        fileName = attachment.longFilename
        if not fileName:
            fileName = attachment.shortFilename
        if not fileName:
            # Neither name is set: generate a placeholder. The builtin next()
            # replaces the Python-2-only .next() method.
            fileName = "attach_{}".format(next(attach_count))

        _, extension = os.path.splitext(fileName.lower())

        # TODO replace this call to "guess" with the mime type stored in the
        # msg attachment. guess_type() yields None for unknown extensions.
        mime_type = mimetypes.guess_type(fileName)[0]

        attach.append({
            "filename": fileName,
            "guid": str(uuid.uuid1()),
            "extension": extension,
            "filesize": filesize,
            "contents64": b64,
            "content_type": mime_type,
        })
    return attach
def extract(email_id, message, categories, preserve_attachments=True):
    """Collect body text and attachments from ``message`` and build its row.

    Every part returned by ``message.walk()`` is inspected:

    * ``text/plain`` parts without a filename are appended to the body,
      separated by a "Next Part" banner.
    * ``text/html`` parts without a filename are appended as well; the body
      type is ``"html"`` only when the html is the first body content seen.
    * Parts carrying a ``Content-Disposition`` header (other than multipart
      containers, delivery-status reports and ``rtf-body.rtf``) are recorded
      as attachments with base64-encoded contents.

    :param email_id: identifier forwarded to ``createRow`` and used in the
        failure message.
    :param message: parsed e-mail exposing the ``email.message.Message`` API
        (``walk``, ``get_content_type``, ``get_payload`` ...).
    :param categories: forwarded unchanged to ``createRow``.
    :param preserve_attachments: when false, parts that have a
        ``Content-Disposition`` but no filename are skipped.
    :return: whatever ``createRow`` returns.
    :raises Exception: any error raised by ``createRow`` is reported on
        stdout and re-raised with its original traceback.
    """
    attach = []
    msg = u''
    body_type = "text"
    attach_count = counter()
    separator = u"\n=============================Next Part==============================\n"

    for part in message.walk():
        content_type = part.get_content_type()
        unnamed = part.get_filename(None) is None

        # Email body may be provided as text, as separate text + html parts,
        # or rarely as html only.
        if content_type == 'text/plain' and unnamed:
            body_type = "text"
            if msg:
                msg += separator

            # Transfer-encoding tokens are case-insensitive (RFC 2045), so
            # normalise before deciding whether the payload needs decoding.
            encoding = part.get_all('Content-Transfer-Encoding', [''])[0]
            decode = encoding.strip().lower() in ('base64', 'quoted-printable')

            charset = part.get_content_charset()
            text = part.get_payload(decode=decode)
            if not charset:
                # No declared charset: fall back to chardet's guess. The guess
                # may itself be None, in which case behaviour is unchanged.
                charset = chardet.detect(text).get('encoding')

            msg += str_to_unicode(text, encoding=charset)
        elif content_type == 'text/html' and unnamed:
            html = decode_body(part)
            body_type = "html"
            if msg:
                # If there is already text in the msg then just treat the
                # html as raw text.
                msg += separator
                body_type = "text"
            msg += html

        # Everything below is attachment handling; skip parts that cannot be
        # attachments.
        if content_type == 'message/delivery-status':
            continue
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue

        fileName = part.get_filename()
        if not fileName and not preserve_attachments:
            continue

        if fileName:
            fileName = convert_encoded(fileName)
        else:
            # Builtin next() instead of the Python-2-only .next() method.
            fileName = "attach_{}".format(next(attach_count))

        if fileName == 'rtf-body.rtf':
            continue

        _, extension = os.path.splitext(fileName.lower())
        filename_guid = str(uuid.uuid1())

        # get_payload(decode=True) may return None (e.g. a nested message),
        # so encoding and size are both guarded.
        bstr = part.get_payload(decode=True)
        b64 = base64.b64encode(bstr) if bstr else ''

        attach.append({
            "filename": fileName,
            "guid": filename_guid,
            "extension": extension,
            "filesize": len(bstr) if bstr else 0,
            "contents64": b64,
            "content_type": content_type,
        })

    try:
        row = createRow(email_id, message, attach, msg, body_type, categories)
    except Exception as e:
        # The original format string had a single placeholder, so the
        # exception was silently dropped from the message. repr() is used so
        # non-ASCII exception text cannot fail while being formatted.
        print("Failed to process message: {} Exception: {!r}".format(email_id, e))
        print(traceback.format_exc())
        # Bare raise keeps the original traceback.
        raise

    return row
예제 #4
0
def extract(email_id, message, categories):
    """Collect plain-text body and attachments from ``message``, build its row.

    ``text/plain`` parts without a ``Content-Disposition`` header are appended
    to the body, separated by a "Next Part" banner. Parts that do carry a
    ``Content-Disposition`` (other than multipart containers, delivery-status
    reports and ``rtf-body.rtf``) are recorded as attachments with
    base64-encoded contents.

    :param email_id: identifier forwarded to ``createRow``.
    :param message: parsed e-mail exposing the ``email.message.Message`` API.
    :param categories: forwarded unchanged to ``createRow``.
    :return: whatever ``createRow`` returns.
    """
    attach = []
    msg = ""
    attach_count = counter()

    for part in message.walk():
        # We only want plain text which is not an attachment.
        # TODO do we need to handle fileName == 'rtf-body.rtf'?
        if part.get_content_type() == 'text/plain' and not part.get('Content-Disposition'):
            if msg:
                msg += "\n=============================Next Part==============================\n"

            # Transfer-encoding tokens are case-insensitive (RFC 2045), so
            # normalise before deciding whether the payload needs decoding.
            encoding = part.get_all('Content-Transfer-Encoding', [''])[0]
            decode = encoding.strip().lower() in ('base64', 'quoted-printable')

            text = part.get_payload(decode=decode)
            if text.strip():
                msg += text

        # TODO honour the part's charset (part.get_param('charset')) when
        #      decoding base64 / quoted-printable text.
        # TODO this may happen multiple times; each piece probably has to be
        #      added back to the text msg, or at least kept as an attachment
        #      so it is available for view and indexing later.
        # TODO the HTML portion is currently missed; it often shows up as
        #      Content-Type 'text/html' with similar text.
        if part.get_content_type() == 'message/delivery-status':
            continue
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue

        fileName = part.get_filename()
        if fileName:
            fileName = fileName.lower()
        else:
            # Builtin next() instead of the Python-2-only .next() method.
            fileName = "attach_{}".format(next(attach_count))

        if fileName == 'rtf-body.rtf':
            continue

        fileName = clean_string(fileName, [
            EXPR_OPTS['fix_utf8'],
            EXPR_OPTS['fix_forwardslash'],
            (r' ', '_'),
            (r'&', '_'),
        ])

        _, extension = os.path.splitext(fileName.lower())
        filename_guid = str(uuid.uuid1())

        # get_payload(decode=True) may return None (e.g. a nested message);
        # guard both the encoding and the size so one such part does not
        # abort the whole extraction.
        bstr = part.get_payload(decode=True)
        b64 = base64.b64encode(bstr) if bstr else ''

        attach.append({
            "filename": fileName,
            "guid": filename_guid,
            "extension": extension,
            "filesize": len(bstr) if bstr else 0,
            "contents64": b64,
        })

    msg = clean_string(msg, [EXPR_OPTS['fix_utf8']])
    row = createRow(email_id, message, attach, msg, categories)

    return row
예제 #5
0
def extract(email_id, message, categories):
    """Collect plain-text body and attachments from ``message``, build its row.

    ``text/plain`` parts without a ``Content-Disposition`` header are appended
    to the body, separated by a "Next Part" banner. Parts that do carry a
    ``Content-Disposition`` (other than multipart containers, delivery-status
    reports and ``rtf-body.rtf``) are recorded as attachments with
    base64-encoded contents.

    :param email_id: identifier forwarded to ``createRow``.
    :param message: parsed e-mail exposing the ``email.message.Message`` API.
    :param categories: forwarded unchanged to ``createRow``.
    :return: whatever ``createRow`` returns.
    """
    attach = []
    msg = ""
    attach_count = counter()
    encoded_transfers = ('base64', 'quoted-printable')

    for part in message.walk():
        # We only want plain text which is not an attachment.
        # TODO do we need to handle fileName == 'rtf-body.rtf'?
        if part.get_content_type() == 'text/plain' and not part.get('Content-Disposition'):
            if msg:
                msg += "\n=============================Next Part==============================\n"

            # Transfer-encoding tokens are case-insensitive (RFC 2045), so
            # compare the normalised value.
            transfer = part.get_all('Content-Transfer-Encoding', [''])[0]
            decode = transfer.strip().lower() in encoded_transfers

            text = part.get_payload(decode=decode)
            if text.strip():
                msg += text

        # TODO honour the part's charset (part.get_param('charset')) when
        #      decoding base64 / quoted-printable text.
        # TODO this may happen multiple times; each piece probably has to be
        #      added back to the text msg, or at least kept as an attachment
        #      so it is available for view and indexing later.
        # TODO the HTML portion is currently missed; it often shows up as
        #      Content-Type 'text/html' with similar text.
        if part.get_content_type() == 'message/delivery-status':
            continue
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue

        fileName = part.get_filename()
        if fileName:
            fileName = fileName.lower()
        else:
            # Builtin next() instead of the Python-2-only .next() method.
            fileName = "attach_{}".format(next(attach_count))

        if fileName == 'rtf-body.rtf':
            continue

        fileName = clean_string(fileName, [
            EXPR_OPTS['fix_utf8'],
            EXPR_OPTS['fix_forwardslash'],
            (r' ', '_'),
            (r'&', '_'),
        ])

        _, extension = os.path.splitext(fileName.lower())
        filename_guid = str(uuid.uuid1())

        # get_payload(decode=True) may return None (e.g. a nested message);
        # without this guard b64encode()/len() would raise and abort the
        # whole extraction.
        bstr = part.get_payload(decode=True)
        if bstr:
            b64 = base64.b64encode(bstr)
            filesize = len(bstr)
        else:
            b64 = ''
            filesize = 0

        attach.append({
            "filename": fileName,
            "guid": filename_guid,
            "extension": extension,
            "filesize": filesize,
            "contents64": b64,
        })

    msg = clean_string(msg, [EXPR_OPTS['fix_utf8']])
    row = createRow(email_id, message, attach, msg, categories)

    return row
def extract(email_id, message, categories, preserve_attachments=True):
    """Collect body text and attachments from ``message`` and build its row.

    Every part returned by ``message.walk()`` is inspected:

    * ``text/plain`` parts without a filename are appended to the body,
      separated by a "Next Part" banner.
    * ``text/html`` parts without a filename are appended as well; the body
      type is ``"html"`` only when the html is the first body content seen.
    * Parts carrying a ``Content-Disposition`` header (other than multipart
      containers, delivery-status reports and ``rtf-body.rtf``) are recorded
      as attachments with base64-encoded contents.

    :param email_id: identifier forwarded to ``createRow`` and used in the
        failure message.
    :param message: parsed e-mail exposing the ``email.message.Message`` API
        (``walk``, ``get_content_type``, ``get_payload`` ...).
    :param categories: forwarded unchanged to ``createRow``.
    :param preserve_attachments: when false, parts that have a
        ``Content-Disposition`` but no filename are skipped.
    :return: whatever ``createRow`` returns.
    :raises Exception: any error raised by ``createRow`` is reported on
        stdout and re-raised with its original traceback.
    """
    attach = []
    msg = u''
    body_type = "text"
    attach_count = counter()
    next_part = u"\n=============================Next Part==============================\n"
    encoded_transfers = ('base64', 'quoted-printable')

    for part in message.walk():
        part_type = part.get_content_type()
        is_body_part = part.get_filename(None) is None

        # Email body may be provided as text, as separate text + html parts,
        # or rarely as html only.
        if part_type == 'text/plain' and is_body_part:
            body_type = "text"
            if msg:
                msg += next_part

            # Transfer-encoding tokens are case-insensitive (RFC 2045), so
            # compare the normalised value.
            transfer = part.get_all('Content-Transfer-Encoding', [''])[0]
            decode = transfer.strip().lower() in encoded_transfers

            charset = part.get_content_charset()
            text = part.get_payload(decode=decode)
            if not charset:
                # The detection result used to be thrown away; use it as the
                # charset. It may still be None when chardet cannot decide.
                charset = chardet.detect(text).get('encoding')

            msg += str_to_unicode(text, encoding=charset)
        elif part_type == 'text/html' and is_body_part:
            html = decode_body(part)
            body_type = "html"
            if msg:
                # If there is already text in the msg then just treat the
                # html as raw text.
                msg += next_part
                body_type = "text"
            msg += html

        # Everything below is attachment handling; skip parts that cannot be
        # attachments.
        if part_type == 'message/delivery-status':
            continue
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue

        fileName = part.get_filename()
        if not fileName and not preserve_attachments:
            continue

        if fileName:
            fileName = convert_encoded(fileName)
        else:
            # Builtin next() instead of the Python-2-only .next() method.
            fileName = "attach_{}".format(next(attach_count))

        if fileName == 'rtf-body.rtf':
            continue

        _, extension = os.path.splitext(fileName.lower())
        filename_guid = str(uuid.uuid1())

        # get_payload(decode=True) may return None (e.g. a nested message),
        # so encoding and size are both guarded.
        bstr = part.get_payload(decode=True)
        if bstr:
            b64 = base64.b64encode(bstr)
            filesize = len(bstr)
        else:
            b64 = ''
            filesize = 0

        attach.append({
            "filename": fileName,
            "guid": filename_guid,
            "extension": extension,
            "filesize": filesize,
            "contents64": b64,
            "content_type": part_type,
        })

    try:
        row = createRow(email_id, message, attach, msg, body_type, categories)
    except Exception as e:
        # The original format string had a single placeholder, so the
        # exception was silently dropped from the message. repr() is used so
        # non-ASCII exception text cannot fail while being formatted.
        print("Failed to process message: {} Exception: {!r}".format(email_id, e))
        print(traceback.format_exc())
        # Bare raise keeps the original traceback.
        raise

    return row