def test_ratom_report_enron_027(
    isolated_cli_runner, enron_dataset_part027, params, expected
):
    msg_id = 2390436

    result = generate_report(
        params, enron_dataset_part027, isolated_cli_runner, expected
    )

    with db_session_from_cmd_out(result) as session:
        # Verify total message count
        assert session.query(Message).count() == 9297

        # Get message contents from DB
        msg = session.query(Message).filter_by(pff_identifier=msg_id).one()
        headers, body = msg.headers, msg.body

        if expected.with_messages:
            # Access message directly and compare
            archive_file = list(enron_dataset_part027.glob("*.pst"))[0]
            with open_mail_archive(archive_file) as archive:
                message = archive.get_message_by_id(msg_id)
                assert cleanup_message_body(*archive.get_message_body(message)) == body
                assert archive.get_message_headers(message) == headers
        else:
            assert headers is None
            assert body is None
def test_get_mbox_message_by_id(sample_mbox_file):
    with open_mail_archive(sample_mbox_file) as archive:
        assert archive.message_count == 113

        # Confirm that id-based lookup matches sequential iteration
        for index, message in enumerate(archive.messages(), start=1):
            msg = archive.get_message_by_id(index)
            assert extract_message_from_archive(archive, index)
            assert archive.format_message(msg) == archive.format_message(message)
            assert archive.get_message_headers(message)
def test_ratom_entities_enron_001(
    isolated_cli_runner, enron_dataset_part001, params, expected
):
    msg_id = 2097572

    # Run entity extraction job with message content flag on
    result = extract_entities(
        params, enron_dataset_part001, isolated_cli_runner, expected
    )

    # Get message contents from DB
    with db_session_from_cmd_out(result) as session:
        msg = session.query(Message).filter_by(pff_identifier=msg_id).one()
        headers, body = msg.headers, msg.body

    # Access message directly and compare
    archive_file = list(enron_dataset_part001.glob("*.pst"))[0]
    with open_mail_archive(archive_file) as archive:
        message = archive.get_message_by_id(msg_id)
        assert cleanup_message_body(*archive.get_message_body(message)) == body
        assert archive.get_message_headers(message) == headers
def test_apply_spacy_model(sample_pst_file, model_name, expected_entity_types):
    # Extract a known (short) message
    msg_id = 2112164
    with open_mail_archive(sample_pst_file) as archive:
        msg_body = archive.get_message_body(archive.get_message_by_id(msg_id))[0]

    # Sanity check
    assert len(msg_body) == 564

    # Pre-load our model to install any missing dependencies
    assert get_cached_spacy_model(model_name)

    # Apply our model pretending to be in a forked process
    with patch("libratom.lib.entities.current_process") as mock_current_process:
        mock_current_process.return_value.name = "NotMainProcess"

        # pylint:disable=no-value-for-parameter
        res, error = process_message(
            # Must use dictionary form if function is called explicitly
            {
                "filepath": sample_pst_file,
                "message_id": msg_id,
                "date": datetime.datetime.utcnow(),
                "body": msg_body,
                "body_type": BodyType.PLAIN,
                "spacy_model_name": model_name,
                "attachments": None,
            }
        )

    assert res and not error

    # Check that the expected entity types were found
    assert expected_entity_types.issubset(
        {entity[1] for entity in res["entities"]}
    )
def get_file_info(path: Path) -> Tuple[Dict, Optional[str]]:
    """
    For a given file path, returns the size, md5 and sha256 checksums
    """

    path_str, name = str(path), path.name
    res = {"path": path_str, "name": name}

    try:
        size = os.stat(path_str).st_size

        md5 = hashlib.md5()
        sha256 = hashlib.sha256()

        # First we read the file one block at a time and update digests
        with open(path_str, "rb") as f:
            for block in iter(partial(f.read, 128), b""):
                md5.update(block)
                sha256.update(block)

        md5, sha256 = md5.hexdigest(), sha256.hexdigest()

        res.update({"size": size, "md5": md5, "sha256": sha256})

        # Then we try to get a message count
        try:
            with open_mail_archive(path) as archive:
                res["msg_count"] = archive.message_count
        except Exception as exc:
            res["error"] = str(exc)

    except Exception as exc:
        return res, str(exc)

    return res, None
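# Minimal usage sketch for get_file_info(), not part of the library itself:
# the "enron_027.pst" path is a placeholder. The second element of the
# returned tuple is an error string, or None on success.
def _print_file_info_example() -> None:
    from pathlib import Path

    info, error = get_file_info(Path("enron_027.pst"))  # placeholder path
    if error is None:
        print(info["size"], info["sha256"], info.get("msg_count"))
    else:
        print(f"Could not process file: {error}")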
def get_messages(
    files: Iterable[Path],
    progress_callback: Callable,
    with_content=True,
    with_headers=False,
    **kwargs,
) -> Generator[Dict, None, None]:
    """
    Message generator to feed a pool of processes from a directory of PST files
    """

    msg_count = 0

    # Iterate over files
    for file in files:
        try:
            with open_mail_archive(file) as archive:
                # Iterate over messages
                for message in archive.messages():
                    try:
                        # Keyword arguments for process_message()
                        res = {
                            "filepath": archive.filepath,
                            "message_id": getattr(message, "identifier", None),
                            "attachments": archive.get_attachment_metadata(message),
                        }

                        try:
                            res["date"] = archive.get_message_date(message)
                        except Exception as exc:
                            res["date"] = None
                            logger.debug(
                                "Unable to extract date from message: {message_id} in file: {filepath}".format(
                                    **res
                                )
                            )
                            logger.debug(exc, exc_info=True)

                        if with_content:
                            body, body_type = archive.get_message_body(message)
                            res["body"] = body
                            res["body_type"] = body_type

                        if with_headers:
                            res["headers"] = archive.get_message_headers(message)

                        # Add any optional arguments
                        res.update(kwargs)

                        yield res

                    except Exception as exc:
                        # Log and move on to the next message
                        message_id = getattr(message, "identifier", None)
                        message_str = (
                            f"message {message_id}" if message_id else "a message"
                        )
                        logger.info(f"Skipping {message_str} from {file}")
                        logger.debug(exc, exc_info=True)

                    finally:
                        msg_count += 1

                        # Update progress every N messages
                        if not msg_count % RATOM_MSG_PROGRESS_STEP:
                            progress_callback(RATOM_MSG_PROGRESS_STEP)

        except Exception as exc:
            # Log and move on to the next file
            logger.info(f"Skipping file {file}")
            logger.debug(exc, exc_info=True)

    # Update progress with remaining message count
    progress_callback(msg_count % RATOM_MSG_PROGRESS_STEP)
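# Minimal usage sketch for get_messages(), not part of the library itself: it
# walks a placeholder directory of PST files with a no-op progress callback
# and counts the yielded message dicts. Any extra keyword arguments (e.g.
# spacy_model_name) are merged into each dict for downstream consumers such
# as process_message().
def _count_messages_example() -> int:
    from pathlib import Path

    files = sorted(Path("enron_dataset").glob("**/*.pst"))  # placeholder path

    count = 0
    for _ in get_messages(files, progress_callback=lambda n: None, with_headers=True):
        count += 1

    return count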