Exemplo n.º 1
0
def traverse_multipart(msg: email.message.Message, counter: int = 0, include_attachment_data: bool = False) -> \
        typing.Dict[str, typing.Any]:
    """Recursively traverse all multi-part elements of an e-mail and collect its attachments.

    Leaf (non-multipart) parts are handed to ``prepare_multipart_part_attachment``.
    A multipart element whose content type is ``message/rfc822`` (an attached
    e-mail) is additionally handed to that helper itself before its parts are
    traversed.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): Index of the first attachment found below ``msg``. It is
            forwarded to ``prepare_multipart_part_attachment``, which is expected to use it
            for generating a file-name when none is found in the header. Default = 0.
        include_attachment_data (bool, optional): Forwarded unchanged to
            ``prepare_multipart_part_attachment``; if true the raw attachment data is
            expected to be included in the result. Default = False.

    Returns:
        dict: The merged dicts returned by ``prepare_multipart_part_attachment`` for every
            visited part. An empty dict is returned for a multipart element without parts.
    """
    if not msg.is_multipart():
        return prepare_multipart_part_attachment(msg, counter,
                                                 include_attachment_data)

    attachments = {}  # type: typing.Dict[str, typing.Any]

    # The explicit header check matters: get_content_type() falls back to a
    # default type when the header is missing.
    if 'content-type' in msg and msg.get_content_type() == 'message/rfc822':
        # This is an e-mail message attachment, add it to the attachment list apart from parsing it
        attachments.update(
            prepare_multipart_part_attachment(
                msg, counter, include_attachment_data))  # type: ignore

    for part in msg.get_payload():  # type: ignore
        # Offset the counter by the number of attachments collected so far;
        # passing the same value to every sibling would give all unnamed
        # attachments the same generated file-name.
        attachments.update(
            traverse_multipart(part, counter + len(attachments),
                               include_attachment_data))  # type: ignore

    return attachments
Exemplo n.º 2
0
def get_raw_body_text(
    msg: email.message.Message
) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]:
    """Recursively retrieve all e-mail body parts and return them as a list.

    A non-multipart part counts as body text when it either has no
    ``Content-Disposition`` header and a ``text/*`` content type, or when its
    file-name ends in ``.htm``/``.html``.

    Args:
        msg (email.message.Message): The actual e-mail message or sub-message.

    Returns:
        list: A list of tuples of the form
            ``(content_transfer_encoding, decoded_body_string, message_header_items)``.
            The encoding is the lower-cased ``Content-Transfer-Encoding`` header
            value, or ``''`` when the header is absent.
    """
    raw_body = [
    ]  # type: typing.List[typing.Tuple[typing.Any, typing.Any,typing.Any]]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            raw_body.extend(get_raw_body_text(part))  # type: ignore
        return raw_body

    # Treat text document attachments as belonging to the body of the mail.
    # Attachments with a file-extension of .htm/.html are implicitly treated
    # as text as well in order not to escape later checks (e.g. URL scan).
    try:
        filename = msg.get_filename('').lower()
    except (binascii.Error, AssertionError):
        logger.exception(
            'Exception occured while trying to parse the content-disposition header. Collected data will not be complete.'
        )
        filename = ''

    is_inline_text = 'content-disposition' not in msg and msg.get_content_maintype() == 'text'
    if not (is_inline_text or filename.endswith(('.html', '.htm'))):
        return raw_body

    encoding = msg.get('content-transfer-encoding', '').lower()
    charset = msg.get_content_charset()

    # Undo the transfer encoding exactly once; both decoding paths below
    # work on the same bytes.
    payload = msg.get_payload(decode=True)

    if charset is None:
        raw_body_str = eml_parser.decode.decode_string(payload, None)
    else:
        try:
            raw_body_str = payload.decode(charset, 'ignore')
        except Exception:
            # Best effort: e.g. an unknown charset name ends up here.
            logger.debug(
                'An exception occured while decoding the payload!',
                exc_info=True)
            raw_body_str = payload.decode('ascii', 'ignore')

    # In case we hit bug 27257, try to downgrade the used policy
    try:
        header_items = msg.items()
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            header_items = msg.items()
        finally:
            # Always hand the message back with its original policy, even
            # if the retry fails as well.
            msg.policy = former_policy

    raw_body.append((encoding, raw_body_str, header_items))

    return raw_body
Exemplo n.º 3
0
    def traverse_multipart(self,
                           msg: email.message.Message,
                           counter: int = 0) -> typing.Dict[str, typing.Any]:
        """Recursively traverse all multi-part elements of an e-mail and collect its attachments.

        Leaf (non-multipart) parts are handed to
        ``self.prepare_multipart_part_attachment``. A multipart element whose
        content type is ``message/rfc822`` (an attached e-mail) is additionally
        handed to that method itself before its parts are traversed.

        Args:
            msg (email.message.Message): An e-mail message object.
            counter (int, optional): Index of the first attachment found below ``msg``. It is
                forwarded to ``prepare_multipart_part_attachment``, which is expected to use
                it for generating a file-name when none is found in the header. Default = 0.

        Returns:
            dict: The merged dicts returned by ``prepare_multipart_part_attachment`` for every
                visited part. An empty dict is returned for a multipart element without parts.
        """
        if not msg.is_multipart():
            return self.prepare_multipart_part_attachment(msg, counter)

        attachments = {}  # type: typing.Dict[str, typing.Any]

        # The explicit header check matters: get_content_type() falls back to
        # a default type when the header is missing.
        if 'content-type' in msg and msg.get_content_type() == 'message/rfc822':
            # This is an e-mail message attachment, add it to the attachment list apart from parsing it
            attachments.update(
                self.prepare_multipart_part_attachment(msg, counter))

        for part in msg.get_payload():
            # Offset the counter by the number of attachments collected so
            # far; passing the same value to every sibling would give all
            # unnamed attachments the same generated file-name.
            attachments.update(
                self.traverse_multipart(part, counter + len(attachments)))

        return attachments
Exemplo n.º 4
0
def traverse_multipart(msg: email.message.Message, counter: int = 0, include_attachment_data: bool = False) -> \
        typing.Dict[str, typing.Any]:
    """Recursively traverse all multi-part elements of an e-mail and collect its attachments.

    Leaf (non-multipart) parts are handed to ``prepare_multipart_part_attachment``.
    A multipart element whose content type is ``message/rfc822`` (an attached
    e-mail) is additionally handed to that helper itself before its parts are
    traversed.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): Index of the first attachment found below ``msg``. It is
            forwarded to ``prepare_multipart_part_attachment``, which is expected to use it
            for generating a file-name when none is found in the header. Default = 0.
        include_attachment_data (bool, optional): Forwarded unchanged to
            ``prepare_multipart_part_attachment``; if true the raw attachment data is
            expected to be included in the result. Default = False.

    Returns:
        dict: The merged dicts returned by ``prepare_multipart_part_attachment`` for every
            visited part. An empty dict is returned for a multipart element without parts.
    """
    if not msg.is_multipart():
        return prepare_multipart_part_attachment(msg, counter, include_attachment_data)

    attachments = {}  # type: typing.Dict[str, typing.Any]

    # The explicit header check matters: get_content_type() falls back to a
    # default type when the header is missing.
    if 'content-type' in msg and msg.get_content_type() == 'message/rfc822':
        # This is an e-mail message attachment, add it to the attachment list apart from parsing it
        attachments.update(
            prepare_multipart_part_attachment(msg, counter, include_attachment_data))  # type: ignore

    for part in msg.get_payload():  # type: ignore
        # Offset the counter by the number of attachments collected so far;
        # passing the same value to every sibling would give all unnamed
        # attachments the same generated file-name.
        next_index = counter + len(attachments)
        attachments.update(traverse_multipart(part, next_index, include_attachment_data))  # type: ignore

    return attachments
Exemplo n.º 5
0
 def __init__(self, message: email.message.Message):
     """Wrap a multipart e-mail message and parse it right away.

     The message is stored first, the body, signature and public-key slots
     are reset to ``None``, and ``self._parse()`` is then called.

     Args:
         message (email.message.Message): The message to wrap. It is assumed
             to be PGP-signed and must therefore be multipart.

     Raises:
         NotMultipart: If ``message`` is not a multipart message. The
             exception is created with ``sender=self.sender_address``.
     """
     # Keep this assignment first: the error path below reads
     # self.sender_address (presumably derived from the stored message).
     self._message = message
     self._body = self._signature = self._public_key = None

     # I am assuming that all messages are sent as pgp-signed
     if message.is_multipart():
         self._parse()
         return

     raise NotMultipart(sender=self.sender_address)
Exemplo n.º 6
0
def get_raw_body_text(msg: email.message.Message) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]:
    """Recursively retrieve all e-mail body parts and return them as a list.

    A non-multipart part counts as body text when it either has no
    ``Content-Disposition`` header and a ``text/*`` content type, or when its
    file-name ends in ``.htm``/``.html``.

    Args:
        msg (email.message.Message): The actual e-mail message or sub-message.

    Returns:
        list: A list of tuples of the form
            ``(content_transfer_encoding, decoded_body_string, message_header_items)``.
            The encoding is the lower-cased ``Content-Transfer-Encoding`` header
            value, or ``''`` when the header is absent.
    """
    raw_body = []  # type: typing.List[typing.Tuple[typing.Any, typing.Any,typing.Any]]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            raw_body.extend(get_raw_body_text(part))  # type: ignore
        return raw_body

    # Treat text document attachments as belonging to the body of the mail.
    # Attachments with a file-extension of .htm/.html are implicitly treated
    # as text as well in order not to escape later checks (e.g. URL scan).
    try:
        filename = msg.get_filename('').lower()
    except (binascii.Error, AssertionError):
        logger.exception(
            'Exception occured while trying to parse the content-disposition header. Collected data will not be complete.')
        filename = ''

    is_inline_text = 'content-disposition' not in msg and msg.get_content_maintype() == 'text'
    if not (is_inline_text or filename.endswith(('.html', '.htm'))):
        return raw_body

    encoding = msg.get('content-transfer-encoding', '').lower()
    charset = msg.get_content_charset()

    # Undo the transfer encoding exactly once; both decoding paths below work on the same bytes.
    payload = msg.get_payload(decode=True)

    if charset is None:
        raw_body_str = eml_parser.decode.decode_string(payload, None)
    else:
        try:
            raw_body_str = payload.decode(charset, 'ignore')
        except Exception:
            # Best effort: e.g. an unknown charset name ends up here.
            logger.debug('An exception occured while decoding the payload!', exc_info=True)
            raw_body_str = payload.decode('ascii', 'ignore')

    # In case we hit bug 27257, try to downgrade the used policy
    try:
        header_items = msg.items()
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            header_items = msg.items()
        finally:
            # Always hand the message back with its original policy, even if the retry fails as well.
            msg.policy = former_policy

    raw_body.append((encoding, raw_body_str, header_items))

    return raw_body
Exemplo n.º 7
0
    def decode_body(message: email.message.Message):
        """Return the decoded body of an e-mail message.

        For a multipart message the direct sub-parts are scanned in order,
        parts carrying a file-name (attachments) are skipped, and the first
        ``text/plain`` or ``text/html`` part is returned. Nested multipart
        containers are not descended into.

        Args:
            message (email.message.Message): The message whose body is wanted.

        Returns:
            The stripped text for a non-multipart message or a ``text/plain``
            part, the result of ``html(...)`` for a ``text/html`` part, or
            ``None`` when a multipart message has no suitable part.
        """
        def _decoded_text(part):
            # bytes.decode(None) raises TypeError, so parts that do not
            # declare a charset fall back to UTF-8 (a superset of the
            # us-ascii default of RFC 2045).
            charset = part.get_content_charset() or 'utf-8'
            return part.get_payload(decode=True).decode(charset)

        # A message without sub-parts is its own body.
        if not message.is_multipart():
            return _decoded_text(message).strip()

        # If the message comes in multiple portions (i.e. there are attachments)
        for part in message.get_payload():
            # Throw away attachments
            if part.get_filename():
                continue

            # Return an html object or plaintext
            content_type = part.get_content_type()
            if content_type == 'text/plain':
                return _decoded_text(part).strip()
            if content_type == 'text/html':
                return html(_decoded_text(part))
            # Note that we can just return because even though it's multipart,
            # the only usual suspects for the multiple parts are extra
            # attachments, which we're avoiding.

        # No text part found among the direct sub-parts.
        return None
Exemplo n.º 8
0
def traverse_multipart(
        msg: email.message.Message,
        counter: int = 0,
        include_attachment_data: bool = False) -> typing.Dict[str, typing.Any]:
    """Recursively traverse all multi-part elements of an e-mail and collect its attachments.

    A non-multipart part is treated as an attachment when it carries a
    ``Content-Disposition`` header or when its main content type is not
    ``text``.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): Index of the first attachment found below ``msg``; used for
            generating a file-name (``part-NNN``) in case there is none found in the header.
            Default = 0.
        include_attachment_data (bool, optional): If true, the base64 encoded attachment data
            is included under the ``raw`` key. Default = False.

    Returns:
        dict: Maps a generated UUID string to a dict per attachment holding ``filename``,
            ``size`` (bytes), ``extension`` (only if the file-name has one), ``hash``,
            ``mime_type``/``mime_type_short`` (only if both magic handles are available),
            ``raw`` (only if requested) and ``content_header`` (lower-cased header name
            mapped to the list of its values).
    """
    attachments = {}  # type: typing.Dict[str, typing.Any]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            # Offset the counter by the number of attachments collected so
            # far; passing the same value to every sibling would give all
            # unnamed attachments the same generated file-name.
            attachments.update(
                traverse_multipart(part, counter + len(attachments),
                                   include_attachment_data))  # type: ignore
        return attachments

    # Read the headers exactly once and reuse them below, so every access is
    # covered by the workaround.
    # In case we hit bug 27257, try to downgrade the used policy
    try:
        header_items = list(msg.items())
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            header_items = list(msg.items())
        finally:
            # Always hand the message back with its original policy.
            msg.policy = former_policy

    header_names = {k.lower() for k, _ in header_items}
    if 'content-disposition' not in header_names and msg.get_content_maintype() == 'text':
        # Plain body text, not an attachment.
        return attachments

    # if it's an attachment-type, pull out the filename
    # and calculate the size in bytes
    # get_payload(decode=True) yields None when no payload was ever set;
    # treat that as an empty attachment instead of failing on len(None).
    data = msg.get_payload(decode=True) or b''  # type: bytes

    filename = msg.get_filename('')
    if filename == '':
        filename = 'part-{0:03d}'.format(counter)
    else:
        filename = eml_parser.decode.decode_field(filename)

    entry = {'filename': filename, 'size': len(data)}  # type: typing.Dict[str, typing.Any]

    # os.path always returns the extension as second element
    # in case there is no extension it returns an empty string
    extension = os.path.splitext(filename)[1].lower()
    if extension:
        # strip leading dot
        entry['extension'] = extension[1:]

    entry['hash'] = get_file_hash(data)

    if not (magic_mime is None or magic_none is None):
        entry['mime_type'] = magic_none.buffer(data)
        entry['mime_type_short'] = magic_mime.buffer(data)

    if include_attachment_data:
        entry['raw'] = base64.b64encode(data)

    # Group header values by lower-cased name; a header may occur more than once.
    content_header = {}  # type: typing.Dict[str, typing.List[str]]
    for name, value in header_items:
        content_header.setdefault(name.lower(), []).append(str(value))
    entry['content_header'] = content_header

    attachments[str(uuid.uuid1())] = entry

    return attachments