def traverse_multipart(msg: email.message.Message, counter: int = 0, include_attachment_data: bool = False) -> \
        typing.Dict[str, typing.Any]:
    """Recursively traverse all e-mail message multi-part elements and return them parsed as a dict.

    A non-multipart message is handed straight to
    ``prepare_multipart_part_attachment``. A multipart message is walked part by
    part and the dicts returned for the individual parts are merged. A
    ``message/rfc822`` container is additionally recorded as an attachment itself
    before its parts are walked.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): A counter which is used for generating attachments
            file-names in case there are none found in the header. While walking
            the parts it is advanced by the number of attachments collected so
            far, so that fallback names do not repeat. Default = 0.
        include_attachment_data (bool, optional): If true, method includes the raw
            attachment data when returning. Default = False.

    Returns:
        dict: Returns a dict with all original multi-part headers as well as
            generated hash check-sums, date size, file extension, real mime-type.
    """
    # Leaf part: nothing to recurse into.
    if not msg.is_multipart():
        return prepare_multipart_part_attachment(msg, counter, include_attachment_data)

    attachments = {}  # type: typing.Dict[str, typing.Any]

    if 'content-type' in msg and msg.get_content_type() == 'message/rfc822':
        # This is an e-mail message attachment, add it to the attachment list apart from parsing it
        attachments.update(prepare_multipart_part_attachment(msg, counter, include_attachment_data))

    for part in msg.get_payload():  # type: ignore
        # Passing the unchanged counter to every part would give all nameless
        # attachments the same generated file-name; offset it by what has
        # already been collected at this level.
        attachments.update(
            traverse_multipart(part, counter + len(attachments), include_attachment_data))  # type: ignore

    return attachments
def traverse_multipart(self, msg: email.message.Message, counter: int = 0) -> typing.Dict[str, typing.Any]:
    """Recursively traverse all e-mail message multi-part elements and return them parsed as a dict.

    A non-multipart message is handed straight to
    ``self.prepare_multipart_part_attachment``. A multipart message is walked
    part by part and the dicts returned for the individual parts are merged. A
    ``message/rfc822`` container is additionally recorded as an attachment itself
    before its parts are walked.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): A counter which is used for generating attachments
            file-names in case there are none found in the header. While walking
            the parts it is advanced by the number of attachments collected so
            far, so that fallback names do not repeat. Default = 0.

    Returns:
        dict: Returns a dict with all original multi-part headers as well as
            generated hash check-sums, date size, file extension, real mime-type.
    """
    # Leaf part: nothing to recurse into.
    if not msg.is_multipart():
        return self.prepare_multipart_part_attachment(msg, counter)

    attachments = {}  # type: typing.Dict[str, typing.Any]

    if 'content-type' in msg and msg.get_content_type() == 'message/rfc822':
        # This is an e-mail message attachment, add it to the attachment list apart from parsing it
        attachments.update(self.prepare_multipart_part_attachment(msg, counter))

    for part in msg.get_payload():
        # Passing the unchanged counter to every part would give all nameless
        # attachments the same generated file-name; offset it by what has
        # already been collected at this level.
        attachments.update(self.traverse_multipart(part, counter + len(attachments)))

    return attachments
def traverse_multipart(msg: email.message.Message, counter: int = 0, include_attachment_data: bool = False) -> \
        typing.Dict[str, typing.Any]:
    """Recursively traverse all e-mail message multi-part elements and return them parsed as a dict.

    A non-multipart message is handed straight to
    ``prepare_multipart_part_attachment``. A multipart message is walked part by
    part and the dicts returned for the individual parts are merged. A
    ``message/rfc822`` container is additionally recorded as an attachment itself
    before its parts are walked.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): A counter which is used for generating attachments
            file-names in case there are none found in the header. While walking
            the parts it is advanced by the number of attachments collected so
            far, so that fallback names do not repeat. Default = 0.
        include_attachment_data (bool, optional): If true, method includes the raw
            attachment data when returning. Default = False.

    Returns:
        dict: Returns a dict with all original multi-part headers as well as
            generated hash check-sums, date size, file extension, real mime-type.
    """
    # Leaf part: nothing to recurse into.
    if not msg.is_multipart():
        return prepare_multipart_part_attachment(msg, counter, include_attachment_data)

    attachments = {}  # type: typing.Dict[str, typing.Any]

    if 'content-type' in msg and msg.get_content_type() == 'message/rfc822':
        # This is an e-mail message attachment, add it to the attachment list apart from parsing it
        attachments.update(prepare_multipart_part_attachment(msg, counter, include_attachment_data))

    for part in msg.get_payload():  # type: ignore
        # Passing the unchanged counter to every part would give all nameless
        # attachments the same generated file-name; offset it by what has
        # already been collected at this level.
        attachments.update(
            traverse_multipart(part, counter + len(attachments), include_attachment_data))  # type: ignore

    return attachments
def prepare_multipart_part_attachment(msg: email.message.Message,
                                      counter: int = 0,
                                      include_attachment_data: bool = False) -> typing.Dict[str, typing.Any]:
    """Extract meta-information from a multipart-part.

    A part is treated as an attachment when it carries a ``Content-Disposition``
    header other than ``inline``, or when its main content type is not ``text``.
    Any other part yields an empty dict.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): A counter which is used for generating attachments
            file-names in case there are none found in the header. Default = 0.
        include_attachment_data (bool, optional): If true, method includes the raw
            attachment data when returning. Default = False.

    Returns:
        dict: Empty if the part is not an attachment. Otherwise a single entry,
            keyed by a freshly generated UUID string, holding ``filename``,
            ``size``, ``hash`` and ``content_header`` plus, when available,
            ``extension``, ``mime_type``, ``mime_type_short`` and ``raw``
            (base64 encoded).
    """
    attachment = {}  # type: typing.Dict[str, typing.Any]

    # In case we hit bug 27257, try to downgrade the used policy
    try:
        lower_keys = {k.lower() for k, _ in msg.items()}
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            lower_keys = {k.lower() for k, _ in msg.items()}
        finally:
            # Always restore the caller's policy, even if the retry fails too.
            msg.policy = former_policy

    is_attachment = ('content-disposition' in lower_keys and msg.get_content_disposition() != 'inline') \
        or msg.get_content_maintype() != 'text'
    if not is_attachment:
        return attachment

    # It is an attachment-type: pull out the data and calculate the size in bytes.
    if msg.get_content_type() == 'message/rfc822':
        payload = msg.get_payload()
        if len(payload) > 1:
            logger.warning(
                'More than one payload for "message/rfc822" part detected. This is not supported, please report!')
        data = bytes(payload[0])
    else:
        data = msg.get_payload(decode=True)  # type: ignore
        if data is None:
            # get_payload(decode=True) yields None for multipart containers and
            # for parts without a body; treat that as empty data instead of
            # failing on len(None).
            data = b''
    file_size = len(data)

    filename = msg.get_filename('')
    if filename == '':
        filename = 'part-{0:03d}'.format(counter)
    else:
        filename = eml_parser.decode.decode_field(filename)

    file_id = str(uuid.uuid1())
    entry = {'filename': filename, 'size': file_size}  # type: typing.Dict[str, typing.Any]

    # os.path always returns the extension as second element
    # in case there is no extension it returns an empty string
    extension = os.path.splitext(filename)[1].lower()
    if extension:
        # strip leading dot
        entry['extension'] = extension[1:]

    entry['hash'] = get_file_hash(data)

    # magic_none / magic_mime are module-level objects that may be None
    # (presumably when the magic library is unavailable - verify at their definition).
    if magic_mime is not None and magic_none is not None:
        mime_type = magic_none.buffer(data)
        mime_type_short = magic_mime.buffer(data)

        if mime_type is not None and mime_type_short is not None:
            entry['mime_type'] = mime_type
            entry['mime_type_short'] = mime_type_short
        else:
            logger.warning('Error determining attachment mime-type - "{}"'.format(file_id))

    if include_attachment_data:
        entry['raw'] = base64.b64encode(data)

    # Collect every header of the part; repeated headers keep all their values.
    ch = {}  # type: typing.Dict[str, typing.List[str]]
    for k, v in msg.items():
        ch.setdefault(k.lower(), []).append(str(v))
    entry['content_header'] = ch

    attachment[file_id] = entry

    return attachment
def prepare_multipart_part_attachment(msg: email.message.Message,
                                      counter: int = 0,
                                      include_attachment_data: bool = False) -> typing.Dict[str, typing.Any]:
    """Extract meta-information from a multipart-part.

    A part is treated as an attachment when it carries a ``Content-Disposition``
    header, or when its main content type is not ``text``. Any other part
    yields an empty dict.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): A counter which is used for generating attachments
            file-names in case there are none found in the header. Default = 0.
        include_attachment_data (bool, optional): If true, method includes the raw
            attachment data when returning. Default = False.

    Returns:
        dict: Empty if the part is not an attachment. Otherwise a single entry,
            keyed by a freshly generated UUID string, holding ``filename``,
            ``size``, ``hash`` and ``content_header`` plus, when available,
            ``extension``, ``mime_type``, ``mime_type_short`` and ``raw``
            (base64 encoded).
    """
    attachment = {}  # type: typing.Dict[str, typing.Any]

    # In case we hit bug 27257, try to downgrade the used policy
    try:
        lower_keys = {k.lower() for k, _ in msg.items()}
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            lower_keys = {k.lower() for k, _ in msg.items()}
        finally:
            # Always restore the caller's policy, even if the retry fails too.
            msg.policy = former_policy

    is_attachment = 'content-disposition' in lower_keys or not msg.get_content_maintype() == 'text'
    if not is_attachment:
        return attachment

    # It is an attachment-type: pull out the data and calculate the size in bytes.
    if msg.get_content_type() == 'message/rfc822':
        payload = msg.get_payload()
        if len(payload) > 1:
            logger.warning(
                'More than one payload for "message/rfc822" part detected. This is not supported, please report!')
        data = bytes(payload[0])
    else:
        data = msg.get_payload(decode=True)  # type: ignore
        if data is None:
            # get_payload(decode=True) yields None for multipart containers and
            # for parts without a body; treat that as empty data instead of
            # failing on len(None).
            data = b''
    file_size = len(data)

    filename = msg.get_filename('')
    if filename == '':
        filename = 'part-{0:03d}'.format(counter)
    else:
        filename = eml_parser.decode.decode_field(filename)

    file_id = str(uuid.uuid1())
    entry = {'filename': filename, 'size': file_size}  # type: typing.Dict[str, typing.Any]

    # os.path always returns the extension as second element
    # in case there is no extension it returns an empty string
    extension = os.path.splitext(filename)[1].lower()
    if extension:
        # strip leading dot
        entry['extension'] = extension[1:]

    entry['hash'] = get_file_hash(data)

    # magic_none / magic_mime are module-level objects that may be None
    # (presumably when the magic library is unavailable - verify at their definition).
    if magic_mime is not None and magic_none is not None:
        mime_type = magic_none.buffer(data)
        mime_type_short = magic_mime.buffer(data)

        if mime_type is not None and mime_type_short is not None:
            entry['mime_type'] = mime_type
            entry['mime_type_short'] = mime_type_short
        else:
            logger.warning('Error determining attachment mime-type - "{}"'.format(file_id))

    if include_attachment_data:
        entry['raw'] = base64.b64encode(data)

    # Collect every header of the part; repeated headers keep all their values.
    ch = {}  # type: typing.Dict[str, typing.List[str]]
    for k, v in msg.items():
        ch.setdefault(k.lower(), []).append(str(v))
    entry['content_header'] = ch

    attachment[file_id] = entry

    return attachment