Пример #1
0
def test_scan_files_with_interrupt(directory_of_mbox_files):
    """
    Check that scan_files returns 1 when FileReport raises KeyboardInterrupt.

    `libratom.lib.report.FileReport` is patched with a MagicMock whose side
    effect is KeyboardInterrupt for the duration of the scan.
    """

    db_name = "test.sqlite3"

    with TemporaryDirectory() as scratch_dir:

        # Database file lives in a throwaway directory
        db_path = Path(scratch_dir) / db_name
        session_factory = db_init(db_path)

        with db_session(session_factory) as session:
            with patch(
                    "libratom.lib.report.FileReport",
                    new=MagicMock(side_effect=KeyboardInterrupt),
            ):
                mbox_files = get_set_of_files(directory_of_mbox_files)
                status = scan_files(files=mbox_files, session=session, jobs=2)

                # Checked while the patch and the session are still active
                assert status == 1
Пример #2
0
def test_extract_entities_from_mbox_files(directory_of_mbox_files):
    """
    Check that extract_entities returns 0 for the mbox fixture files.
    """

    db_name = "test.sqlite3"

    with TemporaryDirectory() as scratch_dir:

        # Database file lives in a throwaway directory
        db_path = Path(scratch_dir) / db_name
        session_factory = db_init(db_path)

        with db_session(session_factory) as session:

            mbox_files = get_set_of_files(directory_of_mbox_files)

            # The first item of load_spacy_model's result is passed as the model
            model = load_spacy_model(SPACY_MODELS.en_core_web_sm)[0]

            status = extract_entities(
                files=mbox_files,
                session=session,
                spacy_model=model,
                jobs=2,
            )

        # Checked once the session context has exited
        assert status == 0
Пример #3
0
def test_run_function_with_interrupt(directory_of_mbox_files, function, patched,
                                     kwargs):
    """
    Check that `function` returns 1 when the `patched` target raises
    KeyboardInterrupt.

    `patched` is the dotted path handed to `patch`; `kwargs` holds extra
    keyword arguments forwarded to `function` alongside `files` and `session`.
    """

    db_name = "test.sqlite3"

    with TemporaryDirectory() as scratch_dir:

        # Database file lives in a throwaway directory
        db_path = Path(scratch_dir) / db_name
        session_factory = db_init(db_path)

        with db_session(session_factory) as session:
            with patch(patched, new=MagicMock(side_effect=KeyboardInterrupt)):

                mbox_files = get_set_of_files(directory_of_mbox_files)
                status = function(files=mbox_files, session=session, **kwargs)

        # Checked once the patch and the session context have exited
        assert status == 1
Пример #4
0
def entities(
    out: Path,
    spacy_model_name: str,
    jobs: Optional[int],
    src: Path,
    include_message_contents: bool = False,
    progress: bool = False,
) -> int:
    """
    Click sub command function called by `ratom entities`

    Initialises a database at `out`, scans the files found under `src`, then
    runs entity extraction on the files whose report carries no error.

    Args:
        out: Database file, or a directory in which a file name is generated
            from OUTPUT_FILENAME_TEMPLATE.
        spacy_model_name: Name handed to load_spacy_model.
        jobs: Forwarded to scan_files and extract_entities.
        src: Location handed to get_set_of_files.
        include_message_contents: Forwarded to extract_entities.
        progress: Use enlighten counters when true, MockContext otherwise.

    Returns:
        1 if the file scan returned 1 or load_spacy_model returned a falsy
        value, otherwise the status returned by extract_entities.
    """

    # Real progress bars when requested, fake ones otherwise
    progress_bar_context = (enlighten.get_manager().counter
                            if progress else MockContext)

    # A directory as output means the file name is generated here
    if out.is_dir():
        timestamp = datetime.now().isoformat(timespec="seconds")
        compact_timestamp = timestamp.translate(str.maketrans("", "", "-:"))
        out = out / OUTPUT_FILENAME_TEMPLATE.format(
            src.name,
            "entities",
            compact_timestamp,
        )

    # The DB file's parent directories may not exist yet
    out.parent.mkdir(parents=True, exist_ok=True)

    session_factory = db_init(out)

    # Collect the files to process from the source
    source_files = get_set_of_files(src)
    if not source_files:
        logger.info(f"No PST file found in {src}")

    # First pass: compute and store file information
    with progress_bar_context(
            total=len(source_files),
            desc="Initial file scan",
            unit="files",
            color="green",
            leave=False,
    ) as file_bar:
        with db_session(session_factory) as session:
            status = scan_files(
                source_files,
                session,
                jobs=jobs,
                progress_callback=file_bar.update,
            )

    if status == 1:
        logger.warning("Aborting")
        return status

    # Load the model up front in case it needs to be downloaded first
    logger.info(f"Loading spaCy model: {spacy_model_name}")
    spacy_model = load_spacy_model(spacy_model_name)
    if not spacy_model:
        return 1

    # Best-effort check for a newer model version; failures are only logged
    spacy_model_version = spacy_model.meta.get("version")
    try:
        latest_version = get_spacy_models()[spacy_model_name][0]
        newest = parse(latest_version)
        current = parse(spacy_model_version)
        if newest > current:
            logger.info(
                f"Model {spacy_model_name} {spacy_model_version} will be used, but {latest_version} is available"
            )
    except Exception as exc:
        logger.debug(exc, exc_info=True)

    # Second pass: get messages and extract entities
    with db_session(session_factory) as session:

        # Record configuration info
        store_configuration_in_db(
            session,
            str(src),
            jobs,
            spacy_model_name,
            spacy_model_version,
        )

        # Total message count across all file reports
        msg_count = session.query(func.sum(FileReport.msg_count)).scalar()

        # Only files whose report has no error are processed
        error_free_reports = session.query(FileReport).filter(
            FileReport.error.is_(None))
        good_files = [
            Path(file_report.path) for file_report in error_free_reports
        ]

        with progress_bar_context(
                total=msg_count,
                desc="Processing messages",
                unit="msg",
                color="blue",
        ) as processing_msg_bar:
            with progress_bar_context(
                    total=msg_count,
                    desc="Generating message reports",
                    unit="msg",
                    color="green",
            ) as reporting_msg_bar:

                status = extract_entities(
                    files=good_files,
                    session=session,
                    spacy_model_name=spacy_model_name,
                    include_message_contents=include_message_contents,
                    jobs=jobs,
                    processing_progress_callback=processing_msg_bar.update,
                    reporting_progress_callback=reporting_msg_bar.update,
                )

    logger.info("All done")

    return status
Пример #5
0
def report(
    out: Path,
    jobs: Optional[int],
    src: Path,
    include_message_contents: bool = False,
    progress: bool = False,
) -> int:
    """
    Click sub command function called by `ratom report`

    Initialises a database at `out`, scans the files found under `src`, then
    generates a report for the files whose file report carries no error.

    Args:
        out: Database file, or a directory in which a file name is generated
            from OUTPUT_FILENAME_TEMPLATE.
        jobs: Forwarded to scan_files and store_configuration_in_db.
        src: Location handed to get_set_of_files.
        include_message_contents: Forwarded to generate_report.
        progress: Use enlighten counters when true, MockContext otherwise.

    Returns:
        The scan status if it is 1, otherwise the status returned by
        generate_report.
    """

    # Real progress bars when requested, fake ones otherwise
    progress_bar_context = (enlighten.get_manager().counter
                            if progress else MockContext)

    # A directory as output means the file name is generated here
    if out.is_dir():
        timestamp = datetime.now().isoformat(timespec="seconds")
        compact_timestamp = timestamp.translate(str.maketrans("", "", "-:"))
        out = out / OUTPUT_FILENAME_TEMPLATE.format(
            src.name,
            "report",
            compact_timestamp,
        )

    # The DB file's parent directories may not exist yet
    out.parent.mkdir(parents=True, exist_ok=True)

    session_factory = db_init(out)

    # Collect the files to process from the source
    source_files = get_set_of_files(src)
    if not source_files:
        logger.info(f"No PST file found in {src}")

    # First pass: compute and store file information
    with progress_bar_context(
            total=len(source_files),
            desc="Initial file scan",
            unit="files",
            color="green",
            leave=False,
    ) as file_bar:
        with db_session(session_factory) as session:
            status = scan_files(
                source_files,
                session,
                jobs=jobs,
                progress_callback=file_bar.update,
            )

    if status == 1:
        logger.warning("Aborting")
        return status

    # Second pass: go through the messages of the usable files
    with db_session(session_factory) as session:

        # Record configuration info
        store_configuration_in_db(session, str(src), jobs)

        # Total message count across all file reports
        msg_count = session.query(func.sum(FileReport.msg_count)).scalar()

        # Only files whose report has no error are processed
        error_free_reports = session.query(FileReport).filter(
            FileReport.error.is_(None))
        good_files = [
            Path(file_report.path) for file_report in error_free_reports
        ]

        with progress_bar_context(
                total=msg_count,
                desc="Processing messages",
                unit="msg",
                color="green",
        ) as msg_bar:

            status = generate_report(
                files=good_files,
                session=session,
                include_message_contents=include_message_contents,
                progress_callback=msg_bar.update,
            )

    logger.info("All done")

    return status