示例#1
0
    def get_raw_body_text(self, msg: email.message.Message) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]:
        """Recursively collect all textual body parts of an e-mail message.

        Multipart messages are walked recursively and the results of all sub-parts are
        concatenated in order. A non-multipart part is treated as body text when it is

        * a ``text/*`` part without a content-disposition header, or
        * a ``text/*`` part whose content-disposition is ``inline``, or
        * any part whose file name ends in ``.htm``/``.html`` (regardless of its type).

        Args:
            msg (email.message.Message): The actual e-mail message or sub-message.

        Returns:
            list: A list of tuples in the form
                ``(content-transfer-encoding, decoded body string, message header items)``.
        """
        raw_body: typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]] = []

        if msg.is_multipart():
            for part in msg.get_payload():
                raw_body.extend(self.get_raw_body_text(part))
            return raw_body

        # Treat text document attachments as belonging to the body of the mail.
        # Attachments with a file-extension of .htm/.html are implicitly treated
        # as text as well in order not to escape later checks (e.g. URL scan).
        try:
            filename = msg.get_filename('').lower()
        except (binascii.Error, AssertionError):
            logger.exception(
                'Exception occurred while trying to parse the content-disposition header. Collected data will not be complete.')
            filename = ''

        is_html_file = filename.endswith(('.html', '.htm'))
        is_text = msg.get_content_maintype() == 'text'
        # Either no disposition at all, or an explicit "inline" disposition.
        is_undisposed_or_inline = 'content-disposition' not in msg or msg.get_content_disposition() == 'inline'

        if not (is_html_file or (is_text and is_undisposed_or_inline)):
            return raw_body

        encoding = msg.get('content-transfer-encoding', '').lower()
        charset = msg.get_content_charset()

        # Undo the transfer encoding once; both decoding attempts below work on the same bytes.
        payload = msg.get_payload(decode=True)

        if charset is None:
            raw_body_str = eml_parser.decode.decode_string(payload, None)
        else:
            try:
                raw_body_str = payload.decode(charset, 'ignore')
            except (LookupError, ValueError):
                # Unknown or unusable charset name: fall back to a lossy ASCII decode.
                logger.debug('An exception occurred while decoding the payload!', exc_info=True)
                raw_body_str = payload.decode('ascii', 'ignore')

        # In case we hit bug 27257, try to downgrade the used policy
        try:
            headers = msg.items()
        except AttributeError:
            former_policy: email.policy.Policy = msg.policy  # type: ignore
            msg.policy = email.policy.compat32  # type: ignore
            try:
                headers = msg.items()
            finally:
                # Always hand the message back with its original policy, even if the retry fails.
                msg.policy = former_policy  # type: ignore

        raw_body.append((encoding, raw_body_str, headers))

        return raw_body
示例#2
0
def prepare_multipart_part_attachment(
        msg: email.message.Message,
        counter: int = 0,
        include_attachment_data: bool = False) -> typing.Dict[str, typing.Any]:
    """Extract meta-information from a multipart-part.

    A part is handled as an attachment when it carries a content-disposition header
    other than ``inline``, or when its main content type is not ``text``. For any
    other part an empty dict is returned.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): A counter which is used for generating attachments
            file-names in case there are none found in the header. It is only read;
            the caller is responsible for incrementing it between parts. Default = 0.
        include_attachment_data (bool, optional): If true, method includes the raw attachment data
            (base64 encoded, under the key ``raw``) when returning. Default = False.

    Returns:
        dict: Empty if the part is not an attachment. Otherwise a dict with a single, freshly
            generated UUID as key whose value holds the original multi-part headers
            (``content_header``) as well as generated hash check-sums, data size, file name,
            file extension and - if libmagic is available - the real mime-type.
    """
    # In case we hit bug 27257, try to downgrade the used policy.
    # The header items are captured once and reused further down, so the attachment
    # branch does not have to call msg.items() again under the failing policy.
    try:
        headers = msg.items()
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            headers = msg.items()
        finally:
            # Always restore the original policy, even if the retry fails as well.
            msg.policy = former_policy

    header_names = {name.lower() for name, _ in headers}

    is_attachment = ('content-disposition' in header_names and msg.get_content_disposition() != 'inline') \
        or msg.get_content_maintype() != 'text'
    if not is_attachment:
        return {}

    # It's an attachment-type: pull out the data and calculate the size in bytes.
    if msg.get_content_type() == 'message/rfc822':
        payload = msg.get_payload()
        if len(payload) > 1:
            logger.warning(
                'More than one payload for "message/rfc822" part detected. This is not supported, please report!'
            )

        # Only the first embedded message is serialised.
        data = bytes(payload[0])
    else:
        data = msg.get_payload(decode=True)  # type: bytes  # type is always bytes here

    filename = msg.get_filename('')
    if filename == '':
        filename = 'part-{0:03d}'.format(counter)
    else:
        filename = eml_parser.decode.decode_field(filename)

    file_id = str(uuid.uuid1())
    details = {
        'filename': filename,
        'size': len(data),
    }  # type: typing.Dict[str, typing.Any]

    # os.path always returns the extension as second element
    # in case there is no extension it returns an empty string
    extension = os.path.splitext(filename)[1].lower()
    if extension:
        # strip leading dot
        details['extension'] = extension[1:]

    details['hash'] = get_file_hash(data)

    if not (magic_mime is None or magic_none is None):
        mime_type = magic_none.buffer(data)
        mime_type_short = magic_mime.buffer(data)

        if not (mime_type is None or mime_type_short is None):
            details['mime_type'] = mime_type
            details['mime_type_short'] = mime_type_short
        else:
            logger.warning('Error determining attachment mime-type - "%s"', file_id)

    if include_attachment_data:
        details['raw'] = base64.b64encode(data)

    # Group header values by lower-cased header name; repeated headers keep all their values.
    content_header = {}  # type: typing.Dict[str, typing.List[str]]
    for name, value in headers:
        content_header.setdefault(name.lower(), []).append(str(value))

    details['content_header'] = content_header

    return {file_id: details}