def test_decompress(self):
    """
    Check decompress() against two fixed inputs with known RTF output.
    """
    # Input whose header carries the 'LZFu' marker.
    packed = (
        '-\x00\x00\x00+\x00\x00\x00LZFu\xf1\xc5\xc7\xa7\x03\x00\n\x00'
        'rcpg125B2\n\xf3 hel\t\x00 bw\x05\xb0ld}\n\x80\x0f\xa0'
    )
    expected = '{\\rtf1\\ansi\\ansicpg1252\\pard hello world}\r\n'
    self.assertEqual(decompress(packed), expected)

    # test raw decompression (header carries the 'MELA' marker)
    raw = (
        '.\x00\x00\x00"\x00\x00\x00MELA \xdf\x12\xce{\\rtf1\\ansi\\an'
        'sicpg1252\\pard test}'
    )
    expected_raw = '{\\rtf1\\ansi\\ansicpg1252\\pard test}'
    self.assertEqual(decompress(raw), expected_raw)
def test_decompress(self):
    """
    Run decompress() over a table of (input, expected RTF) pairs.
    """
    cases = (
        # Input whose header carries the 'LZFu' marker.
        (
            '-\x00\x00\x00+\x00\x00\x00LZFu\xf1\xc5\xc7\xa7\x03\x00\n\x00'
            'rcpg125B2\n\xf3 hel\t\x00 bw\x05\xb0ld}\n\x80\x0f\xa0',
            '{\\rtf1\\ansi\\ansicpg1252\\pard hello world}\r\n',
        ),
        # test raw decompression (header carries the 'MELA' marker)
        (
            '.\x00\x00\x00"\x00\x00\x00MELA \xdf\x12\xce{\\rtf1\\ansi\\an'
            'sicpg1252\\pard test}',
            '{\\rtf1\\ansi\\ansicpg1252\\pard test}',
        ),
    )
    for data, wanted in cases:
        self.assertEqual(decompress(data), wanted)
def email_message(self, policy=email.policy.default):
    """Build an email message object from the properties held in ``self.mfs``.

    The headers come from ``PidTagHeader``; existing content is cleared and
    then rebuilt from whichever of ``PidTagBody`` (plain text),
    ``PidTagBodyHtml`` (HTML) and ``PidTagRtfCompressed`` (decompressed RTF)
    are present, each added as an alternative. Every entry of
    ``self.attachments`` is appended as an ``application/octet-stream``
    attachment.

    Args:
        policy: email policy handed to ``message_from_string``.

    Returns:
        The message produced by ``message_from_string`` with the content
        and attachments added.
    """
    props = self.mfs

    # Previously ``charset`` was only bound when a codepage was present, so a
    # plain-text body without ``PidTagCodepage`` raised UnboundLocalError.
    # 'utf-8' is the email package's own default charset for text content.
    charset = 'utf-8'
    codepage = props.get('PidTagCodepage')
    if codepage:
        # NOTE(review): assumes str(Codepage(...)) yields a codec name
        # accepted by the email package -- confirm against Codepage.
        charset = str(Codepage(codepage))

    email_message = message_from_string(props.get('PidTagHeader'),
                                        policy=policy)
    email_message.clear_content()

    plain_body = props.get('PidTagBody')
    if plain_body:
        email_message.add_alternative(plain_body,
                                      charset=charset,
                                      subtype='plain')

    html_body = props.get('PidTagBodyHtml')
    if html_body:
        email_message.add_alternative(html_body.encode('utf-8'),
                                      maintype='text',
                                      subtype='html')

    packed_rtf = props.get('PidTagRtfCompressed')
    if packed_rtf:
        email_message.add_alternative(compressed_rtf.decompress(packed_rtf),
                                      maintype='application',
                                      subtype='rtf')

    for attachment in self.attachments:
        with attachment.open() as fh:
            email_message.add_attachment(fh.read(),
                                         maintype='application',
                                         subtype='octet-stream',
                                         filename=attachment.filename)
    return email_message
def processFile(filepath, filename, prefix='', _canceler=canceler.FAKE):
    """Convert one message file through the HTML/PDF pipeline.

    The message is opened, its formatted header is assembled into an HTML
    paragraph, and an output directory is created via ``mkdir`` and entered.
    Attachments are saved there; the HTML body is written to ``output.html``
    or, if there is none, the decompressed RTF is written to ``out.rtf`` and
    handed to ``callNode``. The header is then added, images embedded and
    the PDF step invoked. Intermediate files are removed unless the
    module-level ``debug`` flag is set.

    Args:
        filepath: path passed to ``extract_msg.Message``.
        filename: name passed to ``mkdir``; the value it returns is used as
            the working directory.
        prefix: prefix forwarded to ``extract_msg.Message``.
        _canceler: accepted for interface compatibility; not used here.
    """
    messa = extract_msg.Message(filepath, prefix, Attachment)  # Create Message object.
    header = getHeader(messa)  # Gets the header.
    formatted = getFormattedHeader(header)

    # Assemble the header paragraph in a single pass.
    headerString = (
        '<p class=MsoNormal>'
        + ''.join(formatted[u] for u in headerVals)
        + '</p><br>'
    )

    ofilename = filename
    filename = mkdir(filename)
    os.chdir(filename)
    try:
        messa.save_attachments(True)  # Saves the attachments
        if messa.htmlBody is not None:  # Has html body?
            with open('output.html', 'wb') as o:
                o.write(messa.htmlBody)
        else:
            # Read contents, decompress them, and store them in a variable.
            rtfContents = compressed_rtf.decompress(messa.compressedRtf)
            with open('out.rtf', 'wb') as rtfFile:
                rtfFile.write(rtfContents)
            callNode(ofilename, filepath, filename)
            if not debug:
                os.remove('out.rtf')
        addHeader(headerString)
        embedImages()
        callToPdf(filename)
        if not debug:
            os.remove('output.html')
    finally:
        # Always move back to the parent directory, even when a step above
        # fails; otherwise later calls would run inside this output folder.
        os.chdir('..')
def load_message_stream(self, entry, is_top_level, doc):
    """Build ``self._MIME_MSG`` from a message storage entry and return it.

    Properties are parsed from the ``__properties_version1.0`` stream.
    Headers come from the transport headers when present, otherwise from
    the common-header helper. The body is the ``BODY`` property, or a
    placeholder plus an attached decompressed RTF file when ``BODY`` is
    missing. Every ``__attach_version1.0_#...`` child stream is passed to
    the attachment handler.
    """
    # Load stream data.
    props = self.__parse_properties(entry['__properties_version1.0'],
                                    is_top_level, entry, doc)

    # Construct the MIME message.
    self._MIME_MSG = email.message.EmailMessage()

    # Headers: raw transport headers if known, otherwise common ones.
    if 'TRANSPORT_MESSAGE_HEADERS' not in props:
        props = self.__add_common_headers(props)
    else:
        self.__add_transport_headers(props)

    if 'BODY' not in props:
        # Plain text is not available. Use the rich text version.
        doc.rtf_attachments += 1
        fn = "messagebody_{}.rtf".format(doc.rtf_attachments)
        placeholder = (
            "<no plain text message body --- see attachment {}>".format(fn)
        )
        self._MIME_MSG.set_content(placeholder, cte='quoted-printable')

        # Decompress the value to Rich Text Format.
        from compressed_rtf import compress, decompress
        rtf = decompress(props['RTF_COMPRESSED'])

        # Add RTF file as an attachment.
        self._MIME_MSG.add_attachment(rtf, maintype="text", subtype="rtf",
                                      filename=fn)
    else:
        # Add the plain-text body from the BODY field.
        body = props['BODY']
        if not isinstance(body, str):
            self._MIME_MSG.set_content(body, maintype="text",
                                       subtype="plain", cte='8bit')
        else:
            self._MIME_MSG.set_content(body, cte='quoted-printable')

    # NOTE: the remaining properties are deliberately not copied over as
    # headers (that code path is disabled).

    # Add attachments.
    for stream in entry:
        if not stream.name.startswith("__attach_version1.0_#"):
            continue
        self.__process_attachment(stream, doc)

    return self._MIME_MSG
def save_data(in_filename, out_dir, titles, data, meta_data, compressed_rtfs):
    """Write decompressed RTF bodies, attachments and meta files to `out_dir`.

    `out_dir` is created when it does not exist. Each non-empty item of
    `compressed_rtfs` is decompressed into `<in_filename>_data_<i>.rtf`.
    For each entry of `titles` (bytes, decoded as UTF-8) the matching item
    of `data` is written under the decoded title, and the matching item of
    `meta_data` under `<title>_meta_<i>.raw`.
    """
    # make out dir if not exists
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # decode compressed RTF
    for position, packed in enumerate(compressed_rtfs):
        if packed:
            rtf_name = '{}_data_{}.rtf'.format(in_filename, position)
            print('Saving decompressed RTF data to: {}'.format(rtf_name))
            unpacked = compressed_rtf.decompress(packed)
            with open(os.path.join(out_dir, rtf_name), 'wb') as handle:
                handle.write(unpacked)

    # save attachments and meta files
    for position, raw_title in enumerate(titles):
        attachment_name = raw_title.decode('utf-8')
        print('Saving attachment to: {}'.format(attachment_name))
        attachment_path = os.path.join(out_dir, attachment_name)
        with open(attachment_path, 'wb') as handle:
            handle.write(data[position])

        meta_name = '{}_meta_{}.raw'.format(attachment_name, position)
        print('Saving attachment meta file to: {}'.format(meta_name))
        meta_path = os.path.join(out_dir, meta_name)
        with open(meta_path, 'wb') as handle:
            handle.write(meta_data[position])
def test_hither_and_thither_long(self):
    """
    Round-trip a payload of at least 4096 characters through
    compress() and decompress() and expect it back unchanged
    """
    filler = "testtest"
    pieces = ['{\\rtf1\\ansi\\ansicpg1252\\pard hello world']
    size = len(pieces[0])
    # Pad with filler until the text (before the closing brace) reaches 4096.
    while size < 4096:
        pieces.append(filler)
        size += len(filler)
    pieces.append("}")
    data = "".join(pieces)

    round_tripped = decompress(compress(data, compressed=True))
    self.assertEqual(round_tripped, data)
def rtfbody(self):
    """Return the RTF body, decompressed when ``compressed_rtf`` is importable.

    Returns ``None`` when ``self._rtfbody`` is empty. If the import fails, a
    warning is logged and the stored (still compressed) value is returned.
    """
    if not self._rtfbody:
        return None

    try:
        from compressed_rtf import decompress
        # A single NUL byte is appended to the stored data before decoding.
        return decompress(self._rtfbody + b'\x00')
    except ImportError:
        logger.warning("Returning compressed RTF. Install compressed_rtf to decompress")
        return self._rtfbody
def run(config: config_loader.Config) -> None:
    """Parse an RTF file or the RTF body of a .msg file, optionally extracting HTML.

    If ``config.cli_args.file`` is set and exists, it is parsed directly.
    Otherwise, if ``config.cli_args.msg`` is set, the message is opened, its
    attachments are written under ``config.html``, and its compressed RTF
    body is decompressed, saved under ``config.email_rtf`` with an ``.rtf``
    suffix, and parsed. When ``config.cli_args.de_encapsulate_html`` is
    set, the parsed result is passed to ``de_encapsulate`` with an
    ``.html`` target under ``config.html``.

    If neither input is usable the function returns without doing anything.
    Previously that case raised ``UnboundLocalError`` whenever
    ``de_encapsulate_html`` was set, because no parser had been created.
    """
    cli_args = config.cli_args

    if cli_args.file and cli_args.file.exists():
        file_name = cli_args.file.name
        with open(cli_args.file, mode="rb") as rtf_file:
            rp = Rtf_Parser(rtf_file=rtf_file)
            rp.parse_file()
    elif cli_args.msg:
        file_name = cli_args.msg.name
        msg = em.openMsg(f"{cli_args.msg}")
        for attachment in msg.attachments:
            with open(config.html / f"{attachment.longFilename}", mode="wb") as att_file:
                att_file.write(attachment.data)
        decompressed_rtf = cr.decompress(msg.compressedRtf)
        rtf_target = (config.email_rtf / cli_args.msg.name).with_suffix(".rtf")
        with open(rtf_target, mode="wb") as email_rtf:
            email_rtf.write(decompressed_rtf)
        # Parse from memory rather than re-reading the file just written.
        with io.BytesIO(decompressed_rtf) as rtf_file:
            rp = Rtf_Parser(rtf_file=rtf_file)
            rp.parse_file()
    else:
        # Nothing was parsed, so there is nothing to de-encapsulate.
        return

    if cli_args.de_encapsulate_html:
        de_encapsulate(rp, (config.html / file_name).with_suffix(".html"))
def test_hither_and_thither(self):
    """
    Round-trip a short RTF header through compress() and decompress()
    """
    original = '{\\rtf1\\ansi\\mac\\deff0\\deftab720'
    packed = compress(original, compressed=True)
    restored = decompress(packed)
    self.assertEqual(restored, original)
def _set_properties(self):
    """Copy commonly used message properties onto this instance.

    Values are read from ``self._message.properties`` and from the parsed
    transport headers. For sent date, sender, reply-to and recipients the
    first non-empty candidate wins. The body prefers HTML over plain text
    and falls back to the decompressed RTF body when both are empty and
    ``compressed_rtf`` can be imported.
    """
    props = self._message.properties

    # setting generally required properties to easily access using MsOxMessage instance.
    self.subject = props.get("Subject")

    raw_header = props.get("TransportMessageHeaders")
    self.header = parse_email_headers(raw_header, True)
    self.header_dict = parse_email_headers(raw_header) or {}
    headers = self.header_dict

    self.created_date = props.get("CreationTime")
    self.received_date = props.get("ReceiptTime")

    # Delivery time from the properties first, then the Date header.
    self.sent_date = props.get("DeliverTime") or headers.get("Date")

    # Header values take precedence over the property fallbacks below.
    self.sender = (
        headers.get("From") or props.get("SenderRepresentingSmtpAddress")
    )
    self.reply_to = (
        headers.get("Reply-To") or props.get("ReplyRecipientNames")
    )

    self.message_id = props.get("InternetMessageId")

    self.to = (
        headers.get("TO")
        or props.get("DisplayTo")
        or props.get("ReceivedRepresentingSmtpAddress")
    )

    # NOTE: control-character scrubbing of the CC list is currently disabled.
    self.cc = headers.get("CC")
    self.bcc = headers.get("BCC")

    # prefer HTML over plain text
    self.body = props.get("Html") if "Html" in props else props.get("Body")

    if self.body or "RtfCompressed" not in props:
        return

    # Last resort: the compressed RTF body, if the optional decoder exists.
    try:
        import compressed_rtf
    except ImportError:
        return
    self.body = compressed_rtf.decompress(props['RtfCompressed'])
def load_message_stream(entry: CompoundFileEntity, is_top_level: bool, doc: CompoundFileReader):
    """Convert one message storage entry into an ``email.message.EmailMessage``.

    Properties are parsed from the ``__properties_version1.0`` stream.
    Headers come from ``TRANSPORT_MESSAGE_HEADERS`` when present (minus any
    Content-Type header), otherwise they are built from the individual
    metadata properties. The body is the ``BODY`` property, or a placeholder
    plus an attached decompressed RTF file when ``BODY`` is missing. Child
    streams named ``__attach_version1.0_#...`` are passed to
    ``process_attachment``.

    Args:
        entry: storage entry for the message; indexed by stream name and
            iterated for attachment streams.
        is_top_level: forwarded to ``parse_properties``.
        doc: reader object; its ``rtf_attachments`` counter is incremented
            when an RTF body is attached.

    Returns:
        The constructed message.

    Note:
        When ``BODY`` is absent the code indexes ``props["RTF_COMPRESSED"]``
        directly -- presumably a ``KeyError`` if that property is missing too.
    """
    # Load stream data.
    props = parse_properties(entry["__properties_version1.0"], is_top_level,
                             entry, doc)

    # Construct the MIME message....
    msg = email.message.EmailMessage()

    # Add the raw headers, if known.
    if "TRANSPORT_MESSAGE_HEADERS" in props:
        # Get the string holding all of the headers.
        headers = props["TRANSPORT_MESSAGE_HEADERS"]
        if isinstance(headers, bytes):
            headers = headers.decode("utf-8")

        # Remove content-type header because the body we can get this
        # way is just the plain-text portion of the email and whatever
        # Content-Type header was in the original is not valid for
        # reconstructing it this way.
        # re.I must be passed as ``flags``: the fourth positional argument
        # of re.sub is ``count``, so it used to cap replacements at 2 and
        # left the match case-sensitive.
        headers = re.sub("Content-Type: .*(\n\\s.*)*\n", "", headers,
                         flags=re.I)

        # Parse them.
        headers = email.parser.HeaderParser(
            policy=email.policy.default).parsestr(headers)

        # Copy them into the message object.
        for header, value in headers.items():
            msg[header] = value
    else:
        # Construct common headers from metadata.
        if "MESSAGE_DELIVERY_TIME" in props:
            msg["Date"] = formatdate(
                props["MESSAGE_DELIVERY_TIME"].timestamp())
            del props["MESSAGE_DELIVERY_TIME"]

        if "SENDER_NAME" in props:
            if "SENT_REPRESENTING_NAME" in props:
                if props["SENT_REPRESENTING_NAME"]:
                    if props["SENDER_NAME"] != props["SENT_REPRESENTING_NAME"]:
                        props["SENDER_NAME"] += (
                            " (" + props["SENT_REPRESENTING_NAME"] + ")")
                del props["SENT_REPRESENTING_NAME"]
            if props["SENDER_NAME"]:
                msg["From"] = formataddr((props["SENDER_NAME"], ""))
            del props["SENDER_NAME"]

        if "DISPLAY_TO" in props:
            if props["DISPLAY_TO"]:
                msg["To"] = props["DISPLAY_TO"]
            del props["DISPLAY_TO"]

        if "DISPLAY_CC" in props:
            if props["DISPLAY_CC"]:
                msg["CC"] = props["DISPLAY_CC"]
            del props["DISPLAY_CC"]

        if "DISPLAY_BCC" in props:
            if props["DISPLAY_BCC"]:
                msg["BCC"] = props["DISPLAY_BCC"]
            del props["DISPLAY_BCC"]

        if "SUBJECT" in props:
            if props["SUBJECT"]:
                msg["Subject"] = props["SUBJECT"]
            del props["SUBJECT"]

    # Add the plain-text body from the BODY field.
    if "BODY" in props:
        body = props["BODY"]
        if isinstance(body, str):
            msg.set_content(body, cte="quoted-printable")
        else:
            msg.set_content(body, maintype="text", subtype="plain",
                            cte="8bit")

    # Plain-text is not available. Use the rich text version.
    else:
        doc.rtf_attachments += 1
        fn = f"messagebody_{doc.rtf_attachments}.rtf"

        msg.set_content(
            f"<no plain text message body --- see attachment {fn}>",
            cte="quoted-printable",
        )

        # Decompress the value to Rich Text Format.
        rtf = props["RTF_COMPRESSED"]
        rtf = compressed_rtf.decompress(rtf)

        # Add RTF file as an attachment.
        msg.add_attachment(rtf, maintype="text", subtype="rtf", filename=fn)

    # NOTE: remaining properties are deliberately not copied over as headers.

    # Add attachments.
    for stream in entry:
        if stream.name.startswith("__attach_version1.0_#"):
            process_attachment(msg, stream, doc)

    return msg
def rtfBody(self):
    """Return the result of passing ``self.compressedRtf`` to ``decompress``."""
    packed = self.compressedRtf
    return decompress(packed)
def _set_properties(self):
    """Expose frequently used message fields as attributes of this instance.

    Reads ``self._message.properties`` and the parsed transport headers.
    Where both a header and a property can supply a value, the first
    non-empty one in the order coded below is used. The body prefers HTML,
    then plain text; a bytes body is decoded as UTF-8 ignoring errors; if
    the body is still empty, the decompressed RTF body is used when
    ``compressed_rtf`` can be imported.
    """
    properties = self._message.properties
    get_prop = properties.get

    # setting generally required properties to easily access using MsOxMessage instance.
    self.subject = get_prop("Subject")

    transport_headers = get_prop("TransportMessageHeaders")
    self.header = parse_email_headers(transport_headers, True)
    self.header_dict = parse_email_headers(transport_headers) or {}
    get_header = self.header_dict.get

    self.created_date = get_prop("CreationTime")
    self.received_date = get_prop("ReceiptTime")

    sent = get_prop("DeliverTime")
    self.sent_date = sent if sent else get_header("Date")

    sender = get_header("From")
    self.sender = sender if sender else get_prop("SenderRepresentingSmtpAddress")

    reply_to = get_header("Reply-To")
    self.reply_to = reply_to if reply_to else get_prop("ReplyRecipientNames")

    self.message_id = get_prop("InternetMessageId")

    recipients = get_header("TO")
    if not recipients:
        recipients = get_prop("DisplayTo")
        if not recipients:
            recipients = get_prop("ReceivedRepresentingSmtpAddress")
    self.to = recipients

    # NOTE: control-character scrubbing of the CC list is currently disabled.
    self.cc = get_header("CC")
    self.bcc = get_header("BCC")

    # prefer HTML over plain text
    body = get_prop("Html") if "Html" in properties else get_prop("Body")

    # Trying to decode body if it is a bytes obj. This is not the way to go. Quick-fix only.
    # See IMAP specs. Use charset-normalizer, cchardet or chardet as last resort.
    if isinstance(body, bytes):
        body = body.decode("utf-8", "ignore")
    self.body = body

    if not body and "RtfCompressed" in properties:
        try:
            import compressed_rtf
        except ImportError:
            # Optional dependency missing: leave the empty body as is.
            pass
        else:
            self.body = compressed_rtf.decompress(properties["RtfCompressed"])
def load_message_stream(entry, is_top_level, doc):
    """Convert one message storage entry into an ``email.message.EmailMessage``.

    Properties are parsed from the ``__properties_version1.0`` stream.
    Headers come from ``TRANSPORT_MESSAGE_HEADERS`` when present (minus any
    Content-Type header), otherwise they are built from whichever metadata
    properties are available. The body is the ``BODY`` property, or a
    placeholder plus an attached decompressed RTF file when ``BODY`` is
    missing. Child streams named ``__attach_version1.0_#...`` are passed to
    ``process_attachment``.

    Args:
        entry: storage entry for the message; indexed by stream name and
            iterated for attachment streams.
        is_top_level: forwarded to ``parse_properties``.
        doc: reader object; its ``rtf_attachments`` counter is incremented
            when an RTF body is attached.

    Returns:
        The constructed message.

    Note:
        When ``BODY`` is absent the code indexes ``props['RTF_COMPRESSED']``
        directly -- presumably a ``KeyError`` if that property is missing too.
    """
    # Load stream data.
    props = parse_properties(entry['__properties_version1.0'], is_top_level,
                             entry, doc)

    # Construct the MIME message....
    msg = email.message.EmailMessage()

    # Add the raw headers, if known.
    if 'TRANSPORT_MESSAGE_HEADERS' in props:
        # Get the string holding all of the headers.
        headers = props['TRANSPORT_MESSAGE_HEADERS']
        if isinstance(headers, bytes):
            headers = headers.decode("utf-8")

        # Remove content-type header because the body we can get this
        # way is just the plain-text portion of the email and whatever
        # Content-Type header was in the original is not valid for
        # reconstructing it this way.
        # The pattern is a raw string (``\s`` is not a valid str escape) and
        # re.I is passed as ``flags``: the fourth positional argument of
        # re.sub is ``count``, so it used to cap replacements at 2 and left
        # the match case-sensitive.
        headers = re.sub(r"Content-Type: .*(\n\s.*)*\n", "", headers,
                         flags=re.I)

        # Parse them.
        headers = email.parser.HeaderParser(
            policy=email.policy.default).parsestr(headers)

        # Copy them into the message object.
        for header, value in headers.items():
            msg[header] = value
    else:
        # Construct common headers from metadata. Every property is
        # optional: absent or empty values are skipped instead of raising
        # KeyError or producing empty headers.
        if 'MESSAGE_DELIVERY_TIME' in props:
            msg['Date'] = formatdate(
                props['MESSAGE_DELIVERY_TIME'].timestamp())
            del props['MESSAGE_DELIVERY_TIME']

        if 'SENDER_NAME' in props:
            if 'SENT_REPRESENTING_NAME' in props:
                if props['SENT_REPRESENTING_NAME']:
                    if props['SENDER_NAME'] != props['SENT_REPRESENTING_NAME']:
                        props['SENDER_NAME'] += (
                            " (" + props['SENT_REPRESENTING_NAME'] + ")")
                del props['SENT_REPRESENTING_NAME']
            if props['SENDER_NAME']:
                msg['From'] = formataddr((props['SENDER_NAME'], ""))
            del props['SENDER_NAME']

        simple_headers = (
            ('DISPLAY_TO', 'To'),
            ('DISPLAY_CC', 'CC'),
            ('DISPLAY_BCC', 'BCC'),
            ('SUBJECT', 'Subject'),
        )
        for prop_name, header_name in simple_headers:
            if prop_name in props:
                if props[prop_name]:
                    msg[header_name] = props[prop_name]
                del props[prop_name]

    # Add the plain-text body from the BODY field.
    if 'BODY' in props:
        body = props['BODY']
        if isinstance(body, str):
            msg.set_content(body, cte='quoted-printable')
        else:
            msg.set_content(body, maintype="text", subtype="plain",
                            cte='8bit')

    # Plain-text is not available. Use the rich text version.
    else:
        doc.rtf_attachments += 1
        fn = "messagebody_{}.rtf".format(doc.rtf_attachments)

        msg.set_content(
            "<no plain text message body --- see attachment {}>".format(fn),
            cte='quoted-printable')

        # Decompress the value to Rich Text Format.
        import compressed_rtf
        rtf = props['RTF_COMPRESSED']
        rtf = compressed_rtf.decompress(rtf)

        # Add RTF file as an attachment.
        msg.add_attachment(rtf, maintype="text", subtype="rtf", filename=fn)

    # NOTE: remaining properties are deliberately not copied over as headers.

    # Add attachments.
    for stream in entry:
        if stream.name.startswith("__attach_version1.0_#"):
            process_attachment(msg, stream, doc)

    return msg
def rtfBody(self):
    """
    Decompress ``self.compressedRtf`` and return the resulting Rtf body.
    """
    packed_body = self.compressedRtf
    rtf_body = compressed_rtf.decompress(packed_body)
    return rtf_body