Example #1
    # Constructor of a class wrapping a RabbitMQ BackgroundConsumer; only the
    # __init__ method is shown in this snippet.
    def __init__(self, settings_module=""):
        self._config, settings_module = get_config(settings_module)

        rabbit_crud_queue = self._config.RABBITMQ_CRUD_QUEUE
        self._background_consumer = BackgroundConsumer(
            self._rabbit_server_details(), rabbit_crud_queue,
            self._rabbit_message_processor().process_message)
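Throughout these examples, get_config returns a (config, settings_module) tuple, so callers unpack both values even when only one is needed. A minimal sketch of the pattern, using a hypothetical settings module name (passing an empty string falls back to the project's default resolution, as in Example #13):

from crawler.helpers.general_helpers import get_config

# "crawler.config.development" is a hypothetical settings module name used
# only for illustration.
config, settings_module = get_config("crawler.config.development")

print(settings_module)
# Settings are exposed as attributes on the config object, e.g. the RabbitMQ
# queue name read in Example #1.
print(config.RABBITMQ_CRUD_QUEUE)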
Example #2
def test_update_filtered_positives_omitting_dart_outputs_success(
    mock_helper_imports, mock_filter_out_cherrypicked_samples, mock_update_positives, mock_helper_database_updates
):
    _, mock_get_positive_samples = mock_helper_imports
    mock_update_mongo, mock_update_mlwh, mock_update_dart = mock_helper_database_updates

    # mock a successful update
    mock_get_positive_samples.return_value = [{"plate_barcode": "123"}, {"plate_barcode": "456"}]
    non_cp_samples = [{"plate_barcode": "123"}]
    mock_filter_out_cherrypicked_samples.return_value = non_cp_samples
    mock_update_mongo.return_value = True
    mock_update_mlwh.return_value = True
    mock_update_dart.return_value = True

    version = "v2.3"
    mock_pos_id = MagicMock()
    type(mock_pos_id).version = PropertyMock(return_value=version)
    with patch("migrations.update_filtered_positives.current_filtered_positive_identifier", return_value=mock_pos_id):
        with patch("migrations.update_filtered_positives.datetime") as mock_datetime:
            timestamp = datetime.utcnow()
            mock_datetime.utcnow.return_value = timestamp

            # call the migration
            update_filtered_positives.run("crawler.config.integration", True)

            # ensure expected database calls
            config, _ = get_config("crawler.config.integration")
            mock_update_mongo.assert_called_once()
            mock_update_mongo.assert_called_with(config, non_cp_samples, version, timestamp)
            mock_update_mlwh.assert_called_once()
            mock_update_mlwh.assert_called_with(config, non_cp_samples)
            mock_update_dart.assert_not_called()
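The test above mocks the read-only version property by attaching a PropertyMock to the mock's type rather than the instance, because Python resolves properties on the class. A self-contained sketch of that standard-library technique:

from unittest.mock import MagicMock, PropertyMock

mock_identifier = MagicMock()
# Properties are looked up on the class, so the PropertyMock must be attached
# to type(mock_identifier), not to the instance itself.
type(mock_identifier).version = PropertyMock(return_value="v2.3")

assert mock_identifier.version == "v2.3"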
Example #3
def process(run_id: str, config: Optional[Config] = None) -> List[List[str]]:
    """Generates cherrypicker test data for processing by Crawler and then
    processes it via the usual runner.

    The specification of the plates to be generated should be in Mongo. Each
    plate will contain an exact number of positive results between 0 and 96 as
    specified. Up to 200 plates can be generated at a time.

    Arguments:
        run_id: str - The ID of the run. If this is not found in Mongo an
            exception will be thrown.
        config: Config - Optional app config; when omitted it is loaded via
            get_config().

    Returns:
        Metadata about the plates generated, as:
        [ [ "barcode1", "description1" ], [ "barcode2", "description2" ] ]
    """
    logger.info("Begin generating data.")

    if config is None:
        config, _ = get_config()

    with create_mongo_client(config) as mongo_client:
        mongo_db = get_mongo_db(config, mongo_client)
        collection = get_mongo_collection(mongo_db,
                                          COLLECTION_CHERRYPICK_TEST_DATA)

        return process_run(config, collection, run_id)
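A short usage sketch, assuming a hypothetical run ID whose plate specification already exists in Mongo:

# "5f6d3a2b1c4e5d6f7a8b9c0d" is a hypothetical ObjectId string for an
# existing run document.
plates = process("5f6d3a2b1c4e5d6f7a8b9c0d")

# Each entry pairs a generated plate barcode with its description.
for barcode, description in plates:
    print(f"{barcode}: {description}")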
Example #4
File: __init__.py | Project: sanger/crawler
def create_app(config_object: Optional[str] = None) -> flask.Flask:
    app = flask.Flask(__name__)

    if config_object is None:
        app.config.from_object(os.environ["SETTINGS_MODULE"])
    else:
        app.config.from_object(config_object)

    # Setup logging
    logging.config.dictConfig(app.config["LOGGING"])

    if app.config.get("SCHEDULER_RUN", False):
        scheduler.init_app(app)
        scheduler.start()

    config, _ = get_config(config_object or "")
    setup_mongo_indexes(config)
    start_rabbit_consumer(config)
    setup_routes(app)

    @app.get("/health")
    def _health_check():
        """Checks the health of Crawler by checking that there is a scheduled job to run Crawler periodically and an
        instance of the Rabbit Stack subscribed to the message queue or waiting to reconnect.
        """
        if scheduler.get_job(
                SCHEDULER_JOB_ID_RUN_CRAWLER) and rabbit_stack.is_healthy:
            return "Crawler is working", HTTPStatus.OK

        return "Crawler is not working correctly", HTTPStatus.INTERNAL_SERVER_ERROR

    return app
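A sketch of exercising the factory and its /health endpoint with Flask's test client, assuming the crawler.config.test settings module used in Example #14:

app = create_app("crawler.config.test")

with app.test_client() as client:
    response = client.get("/health")
    # 200 when the scheduled job exists and the Rabbit stack is healthy,
    # 500 otherwise (see the endpoint above).
    print(response.status_code, response.get_data(as_text=True))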
Example #5
def scheduled_run():
    """Scheduler's job to do a run every 30 minutes."""
    config, _ = get_config()
    logging.config.dictConfig(config.LOGGING)

    logger.info("Starting scheduled_run job.")

    with scheduler.app.app_context():
        use_sftp = app.config["USE_SFTP"]
        keep_files = app.config["KEEP_FILES"]
        add_to_dart = app.config["ADD_TO_DART"]
        run(use_sftp, keep_files, add_to_dart)
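The scheduler.init_app and scheduler.get_job calls in Example #4 suggest Flask-APScheduler. Assuming that library, a job like the one above could be registered as follows; the interval trigger is an assumption based on the docstring's "every 30 minutes":

# Hypothetical registration, assuming Flask-APScheduler;
# SCHEDULER_JOB_ID_RUN_CRAWLER is the job id checked by the /health endpoint
# in Example #4.
scheduler.add_job(
    id=SCHEDULER_JOB_ID_RUN_CRAWLER,
    func=scheduled_run,
    trigger="interval",
    minutes=30,
)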
Example #6
File: main.py | Project: sanger/crawler
def run(sftp: bool, keep_files: bool, add_to_dart: bool, settings_module: str = "", centre_prefix: str = "") -> None:
    try:
        start = time.time()
        config, settings_module = get_config(settings_module)

        logging.config.dictConfig(config.LOGGING)

        logger.info("-" * 80)
        logger.info("START")
        logger.info(f"Using settings from {settings_module}")

        # get or create the centres collection and filter down to only those with an SFTP data source
        centres = get_centres_config(config, CENTRE_DATA_SOURCE_SFTP)

        with create_mongo_client(config) as client:
            db = get_mongo_db(config, client)
            ensure_mongo_collections_indexed(db)

            if centre_prefix:
                # We are only interested in processing a single centre
                centres = list(filter(lambda config: config.get(CENTRE_KEY_PREFIX) == centre_prefix, centres))
            else:
                # We should only include centres that are to be batch processed
                centres = list(filter(lambda config: config.get(CENTRE_KEY_INCLUDE_IN_SCHEDULED_RUNS, True), centres))

            centres_instances = [Centre(config, centre_config) for centre_config in centres]

            for centre_instance in centres_instances:
                logger.info("*" * 80)
                logger.info(f"Processing {centre_instance.centre_config[CENTRE_KEY_NAME]}")

                try:
                    if sftp:
                        centre_instance.download_csv_files()

                    centre_instance.process_files(add_to_dart)
                except Exception as e:
                    logger.error(f"Error in centre '{centre_instance.centre_config[CENTRE_KEY_NAME]}'")
                    logger.exception(e)
                finally:
                    if not keep_files and centre_instance.is_download_dir_walkable:
                        centre_instance.clean_up()

                # Prioritisation of samples
                update_priority_samples(db, config, add_to_dart)

        logger.info(f"Import complete in {round(time.time() - start, 2)}s")
        logger.info("=" * 80)
    except Exception as e:
        logger.exception(e)
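A usage sketch: passing centre_prefix restricts the run to a single centre, while leaving it empty processes every centre flagged for scheduled runs. The prefix value below is hypothetical:

# "ALDP" is a hypothetical centre prefix used only for illustration.
run(
    sftp=True,
    keep_files=False,
    add_to_dart=True,
    settings_module="crawler.config.integration",
    centre_prefix="ALDP",
)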
Example #7
def run(settings_module: str = "", omit_dart: bool = False) -> None:
    """Updates filtered positive values for all positive samples in pending plates

    Arguments:
        settings_module {str} -- settings module from which to generate the app config
        omit_dart {bool} -- whether to omit DART queries/updates from the process
    """
    config, settings_module = get_config(settings_module)
    logging.config.dictConfig(config.LOGGING)

    logger.info("-" * 80)
    logger.info("STARTING FILTERED POSITIVES UPDATE")
    logger.info(f"Time start: {datetime.now()}")

    num_pending_plates = 0
    num_pos_samples = 0
    num_non_cp_pos_samples = 0
    mongo_updated = False
    mlwh_updated = False
    dart_updated = False
    try:
        samples = []
        if omit_dart:
            # Get positive result samples from Mongo
            logger.warning("Omitting DART from this update")
            samples = positive_result_samples_from_mongo(config)
        else:
            # Get barcodes of pending plates in DART
            logger.info("Selecting pending plates from DART...")
            pending_plate_barcodes = pending_plate_barcodes_from_dart(config)

            if num_pending_plates := len(pending_plate_barcodes):
                logger.info(
                    f"{num_pending_plates} pending plates found in DART")

                # Get positive result samples from Mongo in these pending plates
                logger.info(
                    "Selecting postive samples in pending plates from Mongo..."
                )
                samples = positive_result_samples_from_mongo(
                    config, pending_plate_barcodes)
            else:
Example #8
def test_get_config():
    with pytest.raises(ModuleNotFoundError):
        get_config("x.y.z")
Example #9
def run(settings_module: str = "",
        s_start_datetime: str = "",
        s_end_datetime: str = "") -> None:
    """Migrate the existing samples to have the filtered positive values.

    Arguments:
        settings_module {str} -- settings module from which to generate the app config
        s_start_datetime {str} -- start datetime of the sample range, formatted YYMMDD_HHmm
        s_end_datetime {str} -- end datetime of the sample range, formatted YYMMDD_HHmm
    """
    if not valid_datetime_string(s_start_datetime):
        logger.error(
            "Aborting run: Expected format of Start datetime is YYMMDD_HHmm")
        return

    if not valid_datetime_string(s_end_datetime):
        logger.error(
            "Aborting run: Expected format of End datetime is YYMMDD_HHmm")
        return

    start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT)
    end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT)
    fields_set_datetime = datetime.strptime(FILTERED_POSITIVE_FIELDS_SET_DATE,
                                            "%Y-%m-%d")

    if start_datetime > end_datetime:
        logger.error(
            "Aborting run: End datetime must be greater than Start datetime")
        return

    if end_datetime > fields_set_datetime:
        logger.error(
            "Aborting run: Date range must be prior to the 17th December")
        return

    config, settings_module = get_config(settings_module)

    logging.config.dictConfig(config.LOGGING)

    logger.info("-" * 80)
    logger.info("STARTING FILTERED POSITIVES LEGACY UPDATE")
    logger.info(f"Time start: {datetime.now()}")
    start_time = time.time()

    updated_key = "Updated"
    time_key = "Time taken"

    mongo_versions_updated = {
        FILTERED_POSITIVE_VERSION_0: {
            updated_key: False,
            time_key: 0.0
        },
        FILTERED_POSITIVE_VERSION_1: {
            updated_key: False,
            time_key: 0.0
        },
        FILTERED_POSITIVE_VERSION_2: {
            updated_key: False,
            time_key: 0.0
        },
    }

    mlwh_versions_updated = {
        FILTERED_POSITIVE_VERSION_0: {
            updated_key: False,
            time_key: 0.0
        },
        FILTERED_POSITIVE_VERSION_1: {
            updated_key: False,
            time_key: 0.0
        },
        FILTERED_POSITIVE_VERSION_2: {
            updated_key: False,
            time_key: 0.0
        },
    }

    try:
        continue_migration = pre_migration_filtered_positive_check(
            config, start_datetime, end_datetime)

        if continue_migration:
            logger.info(
                f"Selecting legacy samples from Mongo between {start_datetime} and {end_datetime}..."
            )
            samples = mongo_samples_by_date(config, start_datetime,
                                            end_datetime)

            legacy_samples_num = len(samples)
            logger.info(f"{legacy_samples_num} samples found from Mongo")

            root_sample_ids, plate_barcodes = extract_required_cp_info(samples)

            logger.info("Querying for v0 cherrypicked samples from MLWH")
            # Get v0 cherrypicked samples
            v0_cp_samples_df = get_cherrypicked_samples_by_date(
                config,
                list(root_sample_ids),
                list(plate_barcodes),
                "1970-01-01 00:00:01",
                V0_V1_CUTOFF_TIMESTAMP,
            )

            logger.debug(
                f"Found {len(v0_cp_samples_df.index)} v0 cherrypicked samples"
            )  # type: ignore

            logger.info("Querying for cherrypicked samples from MLWH")
            # Get v1 cherrypicked samples
            v1_cp_samples_df = get_cherrypicked_samples_by_date(
                config,
                list(root_sample_ids),
                list(plate_barcodes),
                V0_V1_CUTOFF_TIMESTAMP,
                V1_V2_CUTOFF_TIMESTAMP,
            )

            logger.debug(
                f"Found {len(v1_cp_samples_df.index)} v1 cherrypicked samples"
            )  # type: ignore

            logger.info("Splitting samples by version...")
            samples_by_version = split_mongo_samples_by_version(
                samples, v0_cp_samples_df, v1_cp_samples_df)

            update_timestamp = datetime.now()

            for version, version_samples in samples_by_version.items():
                filtered_positive_identifier = filtered_positive_identifier_by_version(
                    version)
                logger.info(f"Updating {version} filtered positives...")
                update_filtered_positive_fields(
                    filtered_positive_identifier,
                    version_samples,
                    version,
                    update_timestamp,
                )

            logger.info("Updated filtered positives")

            logger.info("Updating Mongo")

            for version, version_samples in samples_by_version.items():
                logger.info(
                    f"Updating {version} filtered positives in Mongo, total {len(version_samples)} records..."
                )
                mongo_update_start_time = time.time()
                mongo_updated = update_mongo_filtered_positive_fields(
                    config,
                    version_samples,
                    version,
                    update_timestamp,
                )
                if mongo_updated:
                    logger.info(
                        f"Finished updating {version} filtered positives in Mongo"
                    )

                    mongo_update_end_time = time.time()
                    mongo_versions_updated[version][updated_key] = True
                    mongo_versions_updated[version][time_key] = round(
                        mongo_update_end_time - mongo_update_start_time, 2)

                    logger.info(
                        f"Updating {version} filtered positives in MLWH...")
                    mlwh_update_start_time = time.time()

                    mlwh_updated = update_mlwh_filtered_positive_fields_batched(
                        config, version_samples, version, update_timestamp)

                    if mlwh_updated:
                        logger.info(
                            f"Finished updating {version} filtered positives in MLWH"
                        )

                        mlwh_update_end_time = time.time()
                        mlwh_versions_updated[version][updated_key] = True
                        mlwh_versions_updated[version][time_key] = round(
                            mlwh_update_end_time - mlwh_update_start_time, 2)

            logger.info("Finished updating databases")
        else:
            logger.info("Now exiting migration")
    except Exception as e:
        logger.error("---------- Process aborted: ----------")
        logger.error(f"An exception occurred, at {datetime.now()}")
        logger.exception(e)
        raise
    finally:
        end_time = time.time()
        logger.info(f"""
        ---------- Processing status of filtered positive field migration: ----------
        -- Mongo updated with v0 filtered positives: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_0][updated_key]}, \
time taken: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_0][time_key]}s
        -- Mongo updated with v1 filtered positives: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_1][updated_key]}, \
time taken: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_1][time_key]}s
        -- Mongo updated with v2 filtered positives: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_2][updated_key]}, \
time taken: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_2][time_key]}s
        -- MLWH updated with v0 filtered positives: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_0][updated_key]}, \
time taken: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_0][time_key]}s
        -- MLWH updated with v1 filtered positives: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_1][updated_key]}, \
time taken: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_1][time_key]}s
        -- MLWH updated with v2 filtered positives: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_2][updated_key]}, \
time taken: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_2][time_key]}s
        """)

    logger.info(f"Time finished: {datetime.now()}")
    logger.info(f"Migration complete in {round(end_time - start_time, 2)}s")
    logger.info("=" * 80)
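The error messages above imply that MONGO_DATETIME_FORMAT matches the YYMMDD_HHmm shape. A small sketch of the validation inputs under that assumption:

from datetime import datetime

# Assumption: MONGO_DATETIME_FORMAT corresponds to the "YYMMDD_HHmm" shape
# named in the error messages above.
MONGO_DATETIME_FORMAT = "%y%m%d_%H%M"

start = datetime.strptime("201001_0000", MONGO_DATETIME_FORMAT)
end = datetime.strptime("201216_2359", MONGO_DATETIME_FORMAT)
assert start < end  # run() aborts when the start is after the end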
Example #10
File: runner.py | Project: sanger/crawler
def centre_prefix_choices():
    config, _ = get_config("")
    centres = get_centres_config(config, "SFTP")

    return [centre[CENTRE_KEY_PREFIX] for centre in centres]
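A sketch of how such a helper is typically wired into an argparse CLI; the flag name below is illustrative, not taken from the project:

import argparse

parser = argparse.ArgumentParser(description="Run the crawler for one centre")
# --centre-prefix is a hypothetical flag name; the valid choices come from
# the centres configured in Mongo.
parser.add_argument("--centre-prefix", choices=centre_prefix_choices())
args = parser.parse_args()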
Example #11
def run(settings_module: str = "") -> None:
    config, settings_module = get_config(settings_module)

    with create_mongo_client(config) as client:
        db = get_mongo_db(config, client)
        sample_timestamps_helper.add_timestamps_to_samples(db)
Example #12
def run(sftp: bool,
        keep_files: bool,
        add_to_dart: bool,
        settings_module: str = "") -> None:
    try:
        start = time.time()
        config, settings_module = get_config(settings_module)

        logging.config.dictConfig(config.LOGGING)

        logger.info("-" * 80)
        logger.info("START")
        logger.info(f"Using settings from {settings_module}")

        centres = config.CENTRES

        with create_mongo_client(config) as client:
            db = get_mongo_db(config, client)

            # get or create the centres collection
            centres_collection = get_mongo_collection(db, COLLECTION_CENTRES)

            logger.debug(
                f"Creating index '{FIELD_CENTRE_NAME}' on '{centres_collection.full_name}'"
            )
            centres_collection.create_index(FIELD_CENTRE_NAME, unique=True)
            populate_collection(centres_collection, centres, FIELD_CENTRE_NAME)

            # get or create the source plates collection
            source_plates_collection = get_mongo_collection(
                db, COLLECTION_SOURCE_PLATES)

            logger.debug(
                f"Creating index '{FIELD_BARCODE}' on '{source_plates_collection.full_name}'"
            )
            source_plates_collection.create_index(FIELD_BARCODE, unique=True)

            logger.debug(
                f"Creating index '{FIELD_LH_SOURCE_PLATE_UUID}' on '{source_plates_collection.full_name}'"
            )
            source_plates_collection.create_index(FIELD_LH_SOURCE_PLATE_UUID,
                                                  unique=True)

            with samples_collection_accessor(
                    db, COLLECTION_SAMPLES) as samples_collection:
                # Index on plate barcode to make it easier to select based on plate barcode
                logger.debug(
                    f"Creating index '{FIELD_PLATE_BARCODE}' on '{samples_collection.full_name}'"
                )
                samples_collection.create_index(FIELD_PLATE_BARCODE)

                # Index on result column to make it easier to select the positives
                logger.debug(
                    f"Creating index '{FIELD_RESULT}' on '{samples_collection.full_name}'"
                )
                samples_collection.create_index(FIELD_RESULT)

                # Index on unique combination of columns
                logger.debug(
                    f"Creating compound index on '{samples_collection.full_name}'"
                )
                # create compound index on 'Root Sample ID', 'RNA ID', 'Result', 'Lab ID' - some
                # data had the same plate tested at another time so ignore the data if it is exactly
                # the same
                samples_collection.create_index(
                    [
                        (FIELD_ROOT_SAMPLE_ID, pymongo.ASCENDING),
                        (FIELD_RNA_ID, pymongo.ASCENDING),
                        (FIELD_RESULT, pymongo.ASCENDING),
                        (FIELD_LAB_ID, pymongo.ASCENDING),
                    ],
                    unique=True,
                )

                # Index on lh_source_plate_uuid column
                # Added to make lighthouse API source completion event call query more efficient
                logger.debug(
                    f"Creating index '{FIELD_LH_SOURCE_PLATE_UUID}' on '{samples_collection.full_name}'"
                )
                samples_collection.create_index(FIELD_LH_SOURCE_PLATE_UUID)

                centres_instances = [
                    Centre(config, centre_config) for centre_config in centres
                ]
                for centre_instance in centres_instances:
                    logger.info("*" * 80)
                    logger.info(
                        f"Processing {centre_instance.centre_config['name']}")

                    try:
                        if sftp:
                            centre_instance.download_csv_files()

                        centre_instance.process_files(add_to_dart)
                    except Exception as e:
                        logger.error("An exception occured")
                        logger.error(
                            f"Error in centre {centre_instance.centre_config['name']}"
                        )
                        logger.exception(e)
                    finally:
                        if not keep_files and centre_instance.is_download_dir_walkable:
                            centre_instance.clean_up()

        logger.info(f"Import complete in {round(time.time() - start, 2)}s")
        logger.info("=" * 80)
    except Exception as e:
        logger.exception(e)
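The unique compound index created above makes Mongo reject a row whose 'Root Sample ID', 'RNA ID', 'Result' and 'Lab ID' combination has already been imported. A small pymongo sketch of that behaviour, reusing the samples_collection from the code above with hypothetical field values:

from pymongo.errors import DuplicateKeyError

fields = {
    "Root Sample ID": "R00T-1",  # hypothetical values for illustration
    "RNA ID": "RNA-1",
    "Result": "Positive",
    "Lab ID": "LAB-1",
}

samples_collection.insert_one(dict(fields))
try:
    # Same field combination again (fresh dict, so a new _id is generated).
    samples_collection.insert_one(dict(fields))
except DuplicateKeyError:
    print("Duplicate sample row rejected by the compound index")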
Example #13
import logging
import logging.config
import sys

from crawler.helpers.general_helpers import get_config
from migrations import (
    sample_timestamps,
    update_dart,
    update_filtered_positives,
    update_legacy_filtered_positives,
    update_mlwh_with_legacy_samples,
)

config, settings_module = get_config("")

logger = logging.getLogger(__name__)
config.LOGGING["loggers"]["crawler"]["level"] = "DEBUG"
config.LOGGING["loggers"]["crawler"]["handlers"] = ["colored_stream"]
config.LOGGING["formatters"]["colored"][
    "format"] = "%(asctime)-15s %(name)-60s:%(lineno)-3s %(log_color)s%(levelname)-7s %(message)s"

logging.config.dictConfig(config.LOGGING)

##
# Examples of how to run from command line:
# python run_migration.py sample_timestamps
# python run_migration.py update_mlwh_with_legacy_samples 200115_1200 200216_0900
# python run_migration.py update_mlwh_and_dart_with_legacy_samples 200115_1200 200216_0900
# python run_migration.py update_filtered_positives
##
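The snippet does not show how the runner maps its command-line argument to a migration; a hypothetical dispatch on sys.argv, consistent with the usage comments above, might look like this:

# Hypothetical dispatch; the real runner's argument handling is not shown.
migration = sys.argv[1] if len(sys.argv) > 1 else ""

if migration == "sample_timestamps":
    sample_timestamps.run(settings_module)
elif migration == "update_filtered_positives":
    update_filtered_positives.run(settings_module)
else:
    logger.error(f"Unrecognised migration: {migration}")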
Example #14
import logging
import logging.config

import pytest

# create_app is defined in crawler/__init__.py (see Example #4).
from crawler import create_app
from crawler.helpers.db_helpers import ensure_mongo_collections_indexed
from crawler.helpers.general_helpers import get_config, get_sftp_connection
from tests.testing_objects import (
    EVENT_WH_DATA,
    FILTERED_POSITIVE_TESTING_SAMPLES,
    MLWH_SAMPLE_LIGHTHOUSE_SAMPLE,
    MLWH_SAMPLE_STOCK_RESOURCE,
    MLWH_SAMPLES_WITH_FILTERED_POSITIVE_FIELDS,
    MONGO_SAMPLES_WITH_FILTERED_POSITIVE_FIELDS,
    MONGO_SAMPLES_WITHOUT_FILTERED_POSITIVE_FIELDS,
    TESTING_PRIORITY_SAMPLES,
    TESTING_SAMPLES,
)

logger = logging.getLogger(__name__)
CONFIG, _ = get_config("crawler.config.test")
logging.config.dictConfig(CONFIG.LOGGING)


@pytest.fixture
def app():
    app = create_app("crawler.config.test")
    yield app


@pytest.fixture
def client(app):
    return app.test_client()


@pytest.fixture