def test_pffarchive_format_message(enron_dataset_part004, empty_message):
    """Format every message of each ``*.pst`` archive and feed it to the email parser.

    Also checks that formatting ``empty_message`` yields an empty string.
    """
    for archive_path in enron_dataset_part004.glob("*.pst"):
        with PffArchive(archive_path) as archive:
            for msg in archive.messages():
                # The assertion here doesn't matter as much as
                # not getting an exception from python's email parsing module
                parsed = email.message_from_string(
                    archive.format_message(msg), policy=policy.default
                )
                assert parsed or not archive.format_message(msg)

    assert PffArchive.format_message(empty_message) == ""
def test_extract_message_attachments(enron_dataset_part002):
    """Validate attachment extraction against three known MD5 digests.

    Each attachment of the message stored at tree node 2128676 is read,
    written to a temporary directory and compared with its expected checksum.
    """
    # Expected MD5 hex digest, keyed by attachment identifier
    expected_md5 = {
        47685: "d48232614b01e56014293854abbb5db3",
        47717: "cf8be7cd3e6e14307972246e2942c9d1",
        47749: "081e6b66dc89671ff6460adac94dbab1",
    }
    first_pst = next(enron_dataset_part002.glob("*.pst"))

    with PffArchive(first_pst) as archive, TemporaryDirectory() as tmp_dir:
        out_dir = Path(tmp_dir)

        # Get message by ID
        message = archive.tree.get_node(2128676).data

        for att in message.attachments:
            # Read attachment as bytes
            payload = att.read_buffer(att.size)

            # Save attachment
            filepath = out_dir / f"attachment_{message.identifier}_{att.identifier}"
            filepath.write_bytes(payload)

            # Confirm checksum
            assert hashlib.md5(payload).hexdigest() == expected_md5[att.identifier]

            # Sanity check on the file
            assert filepath.stat().st_size == att.size
def test_get_message_by_id(sample_pst_file):
    """Look each message up by identifier and compare it with the iterated one."""
    with PffArchive(sample_pst_file) as archive:
        for expected in archive.messages():
            found = archive.get_message_by_id(expected.identifier)

            # Same identifier and same formatted output
            assert found.identifier == expected.identifier
            assert archive.format_message(found) == archive.format_message(expected)
def test_pff_archive_with_bad_folders(sample_pst_file):
    """Message count and iteration must cope with ``folders`` returning a ``BadPffFolder``."""
    with PffArchive(sample_pst_file) as archive, patch.object(
        archive, "folders"
    ) as folders_mock:
        folders_mock.return_value = [BadPffFolder()]

        # No uncaught exception
        assert archive.message_count == 0
        assert not list(archive.messages())
def load_pst(filename):
    """Scan the PST files matching *filename* below ``..`` and tally their content.

    The parent directory is searched recursively (``**/`` + *filename*). Each
    match is opened with ``PffArchive`` and its messages and attachments are
    counted. Failures are counted rather than raised, so one unreadable
    message or file does not stop the scan.

    Args:
        filename: File name (or glob pattern) appended to ``**/``.

    Returns:
        dict: Counters keyed by ``'Files'``, ``'Messages'``, ``'Attachments'``,
        ``'Size'`` (sum of the files' ``st_size``) and ``'Errors'``.
    """
    mailbox_path = Path("..")
    print(mailbox_path.absolute())

    report = {
        'Files': 0,
        'Messages': 0,
        'Attachments': 0,
        'Size': 0,
        'Errors': 0
    }

    # Start displaying results
    print(sorted(mailbox_path.glob("*")))
    files = sorted(mailbox_path.glob('**/' + filename))
    print(files)

    # Compiled once: matches "<...@...>" tokens in the transport headers
    address_pattern = re.compile(r"<[^<>]*@{1}[^<>]*>")

    # NOTE(review): this tally is filled but never returned or stored
    identifier_set = Counter()

    # Iterate over files
    with tqdm(total=len(files), desc="Files read", unit="files",
              leave=True) as file_bar:
        for pst_file in files:
            try:
                # Iterate over messages
                with PffArchive(pst_file) as archive:
                    print(archive)
                    for message in archive.messages():
                        try:
                            # A message without transport headers is not an
                            # error: treat falsy headers as empty text
                            headers = message.transport_headers or ""
                            for each in address_pattern.findall(headers):
                                # Strip the surrounding angle brackets
                                identifier_set[each[1:-1]] += 1

                            # Update report
                            report['Messages'] += 1
                            report['Attachments'] += message.number_of_attachments
                        except Exception:  # pylint: disable=broad-except
                            # Count the error and move on to the next message
                            report['Errors'] += 1
            except Exception as exc:  # pylint: disable=broad-except
                # Log error and move on to the next file
                print(exc)
                print("except")
                report['Errors'] += 1

            # Update report
            report['Files'] += 1
            report['Size'] += pst_file.stat().st_size

            # The bar was created with total=len(files); advance it per file
            file_bar.update(1)

    return report
def test_extract_enron_messages_from_file(enron_dataset_file):
    """Similar to test_extract_enron_messages but with parametrized fixture"""
    try:
        # Iterate over messages and copy message string
        with PffArchive(enron_dataset_file) as archive:
            for msg in archive.messages():
                archive.format_message(msg)

    except Exception as error:  # pylint: disable=broad-except
        # Any failure is logged instead of being re-raised
        logger.info(f"Inspecting {enron_dataset_file}")
        logger.exception(error)
def test_get_transport_headers_from_sent_items(enron_dataset_part004):
    """Messages in "sent mail" / "sent items" folders must have transport headers."""
    sent_markers = ("sent mail", "sent items")

    for pst_file in enron_dataset_part004.glob("*.pst"):
        with PffArchive(pst_file) as archive:
            for folder in archive.folders():
                try:
                    folder_name = folder.name.lower()
                except AttributeError:
                    # pylint: disable=no-member
                    # Only the root folder is tolerated here; re-raise otherwise
                    if folder.identifier == archive._data.root_folder.identifier:
                        continue
                    raise

                if not any(marker in folder_name for marker in sent_markers):
                    continue

                for message in folder.sub_messages:
                    assert message.transport_headers
def load_pst(self, filename):
    """Collect the addresses found in the transport headers of PST files.

    Every file matching the glob pattern *filename* (relative to the current
    working directory) is opened with ``PffArchive``. Each ``<...@...>`` token
    found in a message's transport headers is recorded without its angle
    brackets. Files and messages that raise are skipped (best effort).

    Args:
        filename: Glob pattern passed to ``Path().glob``.

    Returns:
        None. The result is stored in ``self.identifiers`` as a set, which
        always contains the literal entry ``"*****@*****.**"`` as well.
    """
    # Compiled once: matches "<...@...>" tokens in the transport headers
    address_pattern = re.compile(r"<[^<>]*@{1}[^<>]*>")
    identifiers = set()

    # Iterate over files
    for pst_file in sorted(Path().glob(filename)):
        try:
            # Iterate over messages
            with PffArchive(pst_file) as archive:
                for message in archive.messages():
                    try:
                        # Falsy headers are treated as empty text
                        headers = message.transport_headers or ""
                        matches = address_pattern.findall(headers)
                    except Exception:  # pylint: disable=broad-except
                        # Best effort: move on to the next message
                        continue

                    # Strip the surrounding angle brackets
                    identifiers.update(match[1:-1] for match in matches)
        except Exception:  # pylint: disable=broad-except
            # Best effort: keep what was collected, move on to the next file
            continue

    identifiers.add("*****@*****.**")
    self.identifiers = identifiers
def test_extract_enron_messages(enron_dataset):
    """Format every message of each ``*.pst`` file found below ``enron_dataset``.

    Failures are logged rather than raised; a summary line with the number of
    messages and the cumulated file size is logged at the end.
    """
    nb_extracted, total_size = 0, 0

    for pst_file in enron_dataset.glob("**/*.pst"):
        try:
            # Iterate over messages and copy message string
            with PffArchive(pst_file) as archive:
                for msg in archive.messages():
                    archive.format_message(msg)

                    # Increment message count
                    nb_extracted += 1

            # Add file size to running total
            total_size += pst_file.stat().st_size

        except Exception as error:  # pylint: disable=broad-except
            logger.info(f"Inspecting {pst_file}")
            logger.exception(error)

    logger.info(
        f"Extracted {nb_extracted} messages from a total of {humanfriendly.format_size(total_size)}"
    )
def test_pffarchive_iterate_over_messages(sample_pst_file, bfs):
    """Every message yielded by ``messages(bfs=bfs)`` must have a truthy plain-text body."""
    with PffArchive(sample_pst_file) as archive:
        message_iter = archive.messages(bfs=bfs)

        for msg in message_iter:
            assert msg.plain_text_body
def test_pffarchive_load_from_invalid_type():
    """Building a ``PffArchive`` from an integer must raise ``TypeError``."""
    invalid_source = 1

    with pytest.raises(TypeError):
        PffArchive(invalid_source)
def test_pffarchive_load_from_file_object(sample_pst_file):
    """An archive opened from a binary file object must yield 2668 messages."""
    with sample_pst_file.open(mode="rb") as stream:
        with PffArchive(stream) as archive:
            message_total = sum(1 for _ in archive.messages())

            assert message_total == 2668
def test_get_message_body(message, body_type):
    """The second item returned by ``get_message_body`` must be ``body_type``."""
    body_info = PffArchive().get_message_body(message)

    assert body_info[1] is body_type
def test_get_attachment_metadata(mock_cls):
    """The first attachment's metadata must report ``mime_type`` as ``None``."""
    attachment = mock_cls(name="foo", size="0")
    message = MagicMock(identifier=123, attachments=[attachment])

    metadata = PffArchive().get_attachment_metadata(message)

    assert metadata[0].mime_type is None
def test_get_message_by_id_with_bad_id(sample_pst_file):
    """``get_message_by_id`` must return ``None`` for identifier 1234 in the sample archive."""
    unknown_id = 1234

    with PffArchive(sample_pst_file) as archive:
        assert archive.get_message_by_id(unknown_id) is None