示例#1
0
def test_pffarchive_format_message(enron_dataset_part004, empty_message):
    """Formatted messages must be parseable by the stdlib ``email`` module.

    Walks every ``*.pst`` file found in the ``enron_dataset_part004``
    fixture directory, formats each message and feeds the result to
    ``email.message_from_string``. Finally checks that formatting
    ``empty_message`` yields an empty string.
    """
    for pst_path in enron_dataset_part004.glob("*.pst"):
        with PffArchive(pst_path) as archive:
            for msg in archive.messages():
                # What matters is that the email parser does not raise;
                # the truth value asserted afterwards is secondary.
                parsed = email.message_from_string(
                    archive.format_message(msg), policy=policy.default)
                assert parsed or not archive.format_message(msg)

    formatted_empty = PffArchive.format_message(empty_message)
    assert formatted_empty == ""
示例#2
0
def test_extract_message_attachments(enron_dataset_part002):
    """Validate attachment extraction against three known MD5 digests.

    Opens the first ``*.pst`` file of the fixture directory, fetches the
    message stored at tree node 2128676, writes each of its attachments to
    a temporary directory and checks both the checksum of the bytes read
    and the size of the file written.
    """
    # Expected MD5 hex digest, keyed by attachment identifier
    expected_md5 = {
        47685: "d48232614b01e56014293854abbb5db3",
        47717: "cf8be7cd3e6e14307972246e2942c9d1",
        47749: "081e6b66dc89671ff6460adac94dbab1",
    }

    pst_path = next(enron_dataset_part002.glob("*.pst"))

    with PffArchive(pst_path) as archive, TemporaryDirectory() as tmp_dir:
        out_dir = Path(tmp_dir)

        # Look the message up by its node ID
        message = archive.tree.get_node(2128676).data

        for attachment in message.attachments:
            # Pull the whole attachment into memory
            payload = attachment.read_buffer(attachment.size)

            # Persist it under a name built from both identifiers
            target = out_dir / (
                f"attachment_{message.identifier}_{attachment.identifier}")
            target.write_bytes(payload)

            # The bytes read must match the known checksum
            actual_md5 = hashlib.md5(payload).hexdigest()
            assert actual_md5 == expected_md5[attachment.identifier]

            # The file on disk must have the advertised size
            assert target.stat().st_size == attachment.size
示例#3
0
def test_get_message_by_id(sample_pst_file):
    """Looking a message up by its identifier must return the same message.

    For every message yielded by the archive, the message fetched through
    ``get_message_by_id`` must carry the same identifier and format to the
    same string.
    """
    with PffArchive(sample_pst_file) as archive:
        for original in archive.messages():
            fetched = archive.get_message_by_id(original.identifier)

            assert fetched.identifier == original.identifier

            fetched_text = archive.format_message(fetched)
            original_text = archive.format_message(original)
            assert fetched_text == original_text
示例#4
0
def test_pff_archive_with_bad_folders(sample_pst_file):
    """A misbehaving folder must not make message iteration raise.

    ``archive.folders`` is patched to return a single ``BadPffFolder``;
    the archive is then expected to report zero messages and to yield
    nothing from ``messages()``.
    """
    with PffArchive(sample_pst_file) as archive:
        bad_folders = patch.object(
            archive, "folders", return_value=[BadPffFolder()])

        with bad_folders:
            # Neither call may let an exception escape
            assert archive.message_count == 0
            remaining = list(archive.messages())
            assert not remaining
示例#5
0
def load_pst(filename):
    """Scan PST files named *filename* below the parent directory.

    Every match of ``**/<filename>`` under ``Path("..")`` is opened with
    ``PffArchive`` and its messages are walked. Failures on a single
    message or a single file are printed and counted, never propagated.

    Args:
        filename: File name (or glob pattern) appended to ``**/`` when
            searching below the parent directory.

    Returns:
        dict: Counters keyed by ``'Files'``, ``'Messages'``,
        ``'Attachments'``, ``'Size'`` (sum of ``st_size`` of the files)
        and ``'Errors'``.
    """
    mailbox_path = Path("..")
    print(mailbox_path.absolute())

    report = {
        'Files': 0,
        'Messages': 0,
        'Attachments': 0,
        'Size': 0,
        'Errors': 0
    }

    # Start displaying results
    print(sorted(mailbox_path.glob("*")))
    files = sorted(mailbox_path.glob('**/' + filename))
    print(files)

    # NOTE(review): this tally is filled in but is not part of the returned
    # report; it is kept so header parsing failures are still exercised.
    identifier_set = Counter()

    # Compiled once instead of being looked up again for every message
    address_pattern = re.compile("<[^<>]*@{1}[^<>]*>")

    # Iterate over files
    with tqdm(total=len(files), desc="Files read", unit="files",
              leave=True) as file_bar:
        for pst_file in files:
            try:
                # Iterate over messages
                with PffArchive(pst_file) as archive:
                    print(archive)
                    for message in archive.messages():
                        try:
                            # A missing header block (None) would make the
                            # regex raise TypeError and miscount the message
                            # as an error; treat it as empty instead.
                            headers = message.transport_headers or ""
                            for each in address_pattern.findall(headers):
                                # Strip the surrounding angle brackets
                                identifier_set[each[1:-1]] += 1

                            # Update report
                            report['Messages'] += 1
                            report[
                                'Attachments'] += message.number_of_attachments

                        except Exception as exc:
                            # Log error and move on to the next message
                            print(exc)
                            report['Errors'] += 1

            except Exception as exc:
                # Log error and move on to the next file
                print(exc)
                print("except")
                report['Errors'] += 1

            # Update report
            report['Files'] += 1
            report['Size'] += pst_file.stat().st_size

            # Advance the progress bar; it was previously never updated
            file_bar.update(1)
    return report
示例#6
0
def test_extract_enron_messages_from_file(enron_dataset_file):
    """Format every message of one PST file supplied by a parametrized fixture.

    Any exception raised while opening the archive or formatting a message
    is logged together with the file being inspected rather than propagated.
    """
    try:
        with PffArchive(enron_dataset_file) as archive:
            # Only the side effect of formatting matters; the text is dropped
            for msg in archive.messages():
                archive.format_message(msg)

    except Exception as error:  # pylint: disable=broad-except
        logger.info(f"Inspecting {enron_dataset_file}")
        logger.exception(error)
示例#7
0
def test_get_transport_headers_from_sent_items(enron_dataset_part004):
    """Messages in "sent" folders must expose non-empty transport headers.

    Every ``*.pst`` file of the fixture directory is opened and its folders
    are scanned; folders whose lower-cased name contains ``sent mail`` or
    ``sent items`` have each of their sub-messages checked.
    """
    sent_markers = ("sent mail", "sent items")

    for pst_path in enron_dataset_part004.glob("*.pst"):
        with PffArchive(pst_path) as archive:
            for folder in archive.folders():
                try:
                    folder_name = folder.name.lower()
                except AttributeError:
                    # Only the root folder is allowed to lack a usable name;
                    # for any other folder the error is re-raised.
                    # pylint: disable=no-member
                    if folder.identifier == archive._data.root_folder.identifier:
                        continue
                    raise

                if not any(marker in folder_name for marker in sent_markers):
                    continue

                for msg in folder.sub_messages:
                    assert msg.transport_headers
示例#8
0
        def load_pst(self, filename):
            """Collect the bracketed ``<...@...>`` identifiers found in PST headers.

            Every file matching ``filename`` (globbed relative to the current
            directory) is opened with ``PffArchive``; the transport headers of
            each message are scanned and the matches, stripped of their angle
            brackets, are gathered. Failures on a message or a file are only
            counted. The result is stored on ``self.identifiers`` together
            with one fixed placeholder entry; nothing is returned.
            """
            tally = Counter()

            # Bookkeeping counters; updated while scanning
            report = {
                'Files': 0,
                'Messages': 0,
                'Attachments': 0,
                'Size': 0,
                'Errors': 0
            }

            for pst_file in sorted(Path().glob(filename)):
                try:
                    with PffArchive(pst_file) as archive:
                        for message in archive.messages():
                            try:
                                matches = re.findall(
                                    "<[^<>]*@{1}[^<>]*>",
                                    message.transport_headers)
                                for bracketed in matches:
                                    # Drop the leading "<" and trailing ">"
                                    tally[bracketed[1:-1]] += 1

                                report['Messages'] += 1
                                report[
                                    'Attachments'] += message.number_of_attachments

                            except Exception:
                                # Count the failure, continue with next message
                                report['Errors'] += 1

                except Exception:
                    # Count the failure, continue with next file
                    report['Errors'] += 1

                report['Files'] += 1
                report['Size'] += pst_file.stat().st_size

            collected = set(tally)
            collected.add("*****@*****.**")
            self.identifiers = collected
示例#9
0
def test_extract_enron_messages(enron_dataset):
    """Format every message of every PST file found below the fixture directory.

    Counts the messages formatted and sums the size of the files that were
    processed without error. A failing file is logged and skipped. A summary
    line is logged at the end.
    """
    total_size = 0
    nb_extracted = 0

    for pst_path in enron_dataset.glob("**/*.pst"):
        try:
            with PffArchive(pst_path) as archive:
                for msg in archive.messages():
                    # Only the formatting side effect matters here
                    archive.format_message(msg)
                    nb_extracted += 1

            # Reached only when the whole file was read without error
            total_size += pst_path.stat().st_size

        except Exception as error:  # pylint: disable=broad-except
            logger.info(f"Inspecting {pst_path}")
            logger.exception(error)

    logger.info(
        f"Extracted {nb_extracted} messages from a total of {humanfriendly.format_size(total_size)}"
    )
示例#10
0
def test_pffarchive_iterate_over_messages(sample_pst_file, bfs):
    """Every message yielded for the given ``bfs`` setting has a plain-text body."""
    with PffArchive(sample_pst_file) as archive:
        for msg in archive.messages(bfs=bfs):
            body = msg.plain_text_body
            assert body
示例#11
0
def test_pffarchive_load_from_invalid_type():
    """Constructing a ``PffArchive`` from an integer must raise ``TypeError``."""
    unsupported_source = 1

    with pytest.raises(TypeError):
        PffArchive(unsupported_source)
示例#12
0
def test_pffarchive_load_from_file_object(sample_pst_file):
    """An archive opened from a binary file object yields 2668 messages."""
    with sample_pst_file.open(mode="rb") as handle:
        with PffArchive(handle) as archive:
            messages = list(archive.messages())
            assert len(messages) == 2668
示例#13
0
def test_get_message_body(message, body_type):
    """The second element returned by ``get_message_body`` is ``body_type``."""
    result = PffArchive().get_message_body(message)
    detected_type = result[1]
    assert detected_type is body_type
示例#14
0
def test_get_attachment_metadata(mock_cls):
    """The first metadata entry of a mocked attachment has no MIME type."""
    attachment = mock_cls(name="foo", size="0")
    message = MagicMock(identifier=123, attachments=[attachment])

    metadata = PffArchive().get_attachment_metadata(message)
    assert metadata[0].mime_type is None
示例#15
0
def test_get_message_by_id_with_bad_id(sample_pst_file):
    """Looking up identifier 1234 in the sample archive returns ``None``."""
    unknown_id = 1234

    with PffArchive(sample_pst_file) as archive:
        found = archive.get_message_by_id(unknown_id)
        assert found is None