def __init__(self, settings_module=""):
    """Load the configuration and build the background consumer for the CRUD queue.

    Arguments:
        settings_module {str} -- settings module handed to get_config to generate the config
    """
    self._config, _ = get_config(settings_module)

    # Gather the three constructor arguments up front, in the order they were originally evaluated.
    crud_queue_name = self._config.RABBITMQ_CRUD_QUEUE
    server_details = self._rabbit_server_details()
    on_message = self._rabbit_message_processor().process_message

    self._background_consumer = BackgroundConsumer(server_details, crud_queue_name, on_message)
def test_update_filtered_positives_omitting_dart_outputs_success(
    mock_helper_imports,
    mock_filter_out_cherrypicked_samples,
    mock_update_positives,
    mock_helper_database_updates,
):
    """Running the migration with DART omitted updates Mongo and MLWH once each and never touches DART."""
    settings = "crawler.config.integration"
    _, mock_get_positive_samples = mock_helper_imports
    mock_update_mongo, mock_update_mlwh, mock_update_dart = mock_helper_database_updates

    # arrange: two positive samples, only one of which survives the cherrypicked filter
    mock_get_positive_samples.return_value = [{"plate_barcode": "123"}, {"plate_barcode": "456"}]
    non_cp_samples = [{"plate_barcode": "123"}]
    mock_filter_out_cherrypicked_samples.return_value = non_cp_samples

    # arrange: every database update reports success
    mock_update_mongo.return_value = True
    mock_update_mlwh.return_value = True
    mock_update_dart.return_value = True

    # arrange: the filtered positive identifier exposes a fixed version
    version = "v2.3"
    mock_pos_id = MagicMock()
    type(mock_pos_id).version = PropertyMock(return_value=version)

    with patch(
        "migrations.update_filtered_positives.current_filtered_positive_identifier",
        return_value=mock_pos_id,
    ), patch("migrations.update_filtered_positives.datetime") as mock_datetime:
        # freeze the timestamp the migration will see
        timestamp = datetime.utcnow()
        mock_datetime.utcnow.return_value = timestamp

        # act
        update_filtered_positives.run(settings, True)

        # assert: Mongo and MLWH updated exactly once with the filtered samples, DART untouched
        config, _ = get_config(settings)
        mock_update_mongo.assert_called_once_with(config, non_cp_samples, version, timestamp)
        mock_update_mlwh.assert_called_once_with(config, non_cp_samples)
        mock_update_dart.assert_not_called()
def process(run_id: str, config: Config = None) -> List[List[str]]:
    """Generate cherrypicker test data for a run and hand it to the usual runner.

    The plate specification for the run is expected to be in Mongo. Each plate holds an exact,
    specified number of positive results between 0 and 96, and up to 200 plates can be generated
    at a time.

    Arguments:
        run_id: str - The ID of the run. If this is not found in Mongo an exception will be thrown.
        config: Config - Optional configuration; when None it is loaded via get_config().

    Returns:
        Metadata about the plates generated, as:
        [ [ "barcode1", "description1" ], [ "barcode2", "description2" ] ]
    """
    logger.info("Begin generating data.")

    if config is None:
        config, _settings_module = get_config()

    with create_mongo_client(config) as client:
        test_data_collection = get_mongo_collection(
            get_mongo_db(config, client), COLLECTION_CHERRYPICK_TEST_DATA
        )
        plates_metadata = process_run(config, test_data_collection, run_id)

    return plates_metadata
def create_app(config_object: str = None) -> flask.Flask:
    """Build the Flask application, start its background services and register its routes.

    Arguments:
        config_object {str} -- settings object to load into the Flask config; when None the value
        of the SETTINGS_MODULE environment variable is used instead

    Returns:
        The configured Flask application.
    """
    app = flask.Flask(__name__)

    # The environment is only consulted when no explicit settings object was supplied.
    settings_source = os.environ["SETTINGS_MODULE"] if config_object is None else config_object
    app.config.from_object(settings_source)

    # Setup logging
    logging.config.dictConfig(app.config["LOGGING"])

    scheduler_enabled = app.config.get("SCHEDULER_RUN", False)
    if scheduler_enabled:
        scheduler.init_app(app)
        scheduler.start()

    crawler_config, _ = get_config(config_object or "")
    setup_mongo_indexes(crawler_config)
    start_rabbit_consumer(crawler_config)

    setup_routes(app)

    @app.get("/health")
    def _health_check():
        """Report whether Crawler is healthy.

        Healthy means a scheduled job exists to run Crawler periodically and the Rabbit Stack
        reports itself healthy (subscribed to the message queue or waiting to reconnect).
        """
        crawler_job = scheduler.get_job(SCHEDULER_JOB_ID_RUN_CRAWLER)
        if not crawler_job or not rabbit_stack.is_healthy:
            return "Crawler is not working correctly", HTTPStatus.INTERNAL_SERVER_ERROR

        return "Crawler is working", HTTPStatus.OK

    return app
def scheduled_run():
    """Scheduler job that performs a crawler run (documented as running every 30 minutes)."""
    crawler_config, _ = get_config()
    logging.config.dictConfig(crawler_config.LOGGING)

    logger.info("Starting scheduled_run job.")

    # The run flags live in the Flask config, so they are read inside the app context.
    with scheduler.app.app_context():
        run(
            app.config["USE_SFTP"],
            app.config["KEEP_FILES"],
            app.config["ADD_TO_DART"],
        )
def run(sftp: bool, keep_files: bool, add_to_dart: bool, settings_module: str = "", centre_prefix: str = "") -> None:
    """Process the files of the selected centres and update priority samples.

    Any exception raised along the way is logged rather than propagated to the caller.

    Arguments:
        sftp {bool} -- when true, each centre's CSV files are downloaded before being processed
        keep_files {bool} -- when false, a centre's downloads are cleaned up after processing,
        provided its download directory is walkable
        add_to_dart {bool} -- passed through to the centre file processing and to the priority
        samples update
        settings_module {str} -- settings module from which to generate the app config
        centre_prefix {str} -- when given, only centres whose prefix equals this value are
        processed; otherwise every centre is processed unless its include-in-scheduled-runs
        setting is falsy (the setting defaults to True when absent)
    """
    # NOTE(review): block nesting below was reconstructed from a whitespace-flattened copy of this
    # function -- confirm the indentation against the canonical file before merging.
    try:
        start = time.time()
        # get_config also returns the settings module name, which is logged below
        config, settings_module = get_config(settings_module)

        logging.config.dictConfig(config.LOGGING)

        logger.info("-" * 80)
        logger.info("START")
        logger.info(f"Using settings from {settings_module}")

        # get or create the centres collection and filter down to only those with an SFTP data source
        centres = get_centres_config(config, CENTRE_DATA_SOURCE_SFTP)

        with create_mongo_client(config) as client:
            db = get_mongo_db(config, client)
            ensure_mongo_collections_indexed(db)

            # Note: the lambda parameter below is named `config` and shadows the outer `config`
            # inside the lambda only; there it refers to a single centre's configuration.
            if centre_prefix:
                # We are only interested in processing a single centre
                centres = list(filter(lambda config: config.get(CENTRE_KEY_PREFIX) == centre_prefix, centres))
            else:
                # We should only include centres that are to be batch processed
                centres = list(filter(lambda config: config.get(CENTRE_KEY_INCLUDE_IN_SCHEDULED_RUNS, True), centres))

            centres_instances = [Centre(config, centre_config) for centre_config in centres]

            for centre_instance in centres_instances:
                logger.info("*" * 80)
                logger.info(f"Processing {centre_instance.centre_config[CENTRE_KEY_NAME]}")

                try:
                    if sftp:
                        centre_instance.download_csv_files()

                    centre_instance.process_files(add_to_dart)
                except Exception as e:
                    # A failure in one centre is logged and does not stop the remaining centres
                    logger.error(f"Error in centre '{centre_instance.centre_config[CENTRE_KEY_NAME]}'")
                    logger.exception(e)
                finally:
                    # Clean up runs whether or not processing succeeded
                    if not keep_files and centre_instance.is_download_dir_walkable:
                        centre_instance.clean_up()

                # Prioritisation of samples
                # NOTE(review): reconstructed as running once per centre; confirm it is not meant
                # to run once after the loop.
                update_priority_samples(db, config, add_to_dart)

        logger.info(f"Import complete in {round(time.time() - start, 2)}s")
        logger.info("=" * 80)
    except Exception as e:
        # Top-level boundary: log and swallow so the caller never sees the exception
        logger.exception(e)
def run(settings_module: str = "", omit_dart: bool = False) -> None: """Updates filtered positive values for all positive samples in pending plates Arguments: settings_module {str} -- settings module from which to generate the app config omit_dart {bool} -- whether to omit DART queries/updates from the process """ config, settings_module = get_config(settings_module) logging.config.dictConfig(config.LOGGING) logger.info("-" * 80) logger.info("STARTING FILTERED POSITIVES UPDATE") logger.info(f"Time start: {datetime.now()}") num_pending_plates = 0 num_pos_samples = 0 num_non_cp_pos_samples = 0 mongo_updated = False mlwh_updated = False dart_updated = False try: samples = [] if omit_dart: # Get positive result samples from Mongo logger.warning("Omitting DART from this update") samples = positive_result_samples_from_mongo(config) else: # Get barcodes of pending plates in DART logger.info("Selecting pending plates from DART...") pending_plate_barcodes = pending_plate_barcodes_from_dart(config) if num_pending_plates := len(pending_plate_barcodes): logger.info( f"{num_pending_plates} pending plates found in DART") # Get positive result samples from Mongo in these pending plates logger.info( "Selecting postive samples in pending plates from Mongo..." ) samples = positive_result_samples_from_mongo( config, pending_plate_barcodes) else:
def test_get_config():
    """get_config raises ModuleNotFoundError when asked for the settings module "x.y.z"."""
    unknown_settings_module = "x.y.z"

    with pytest.raises(ModuleNotFoundError):
        get_config(unknown_settings_module)
def run(settings_module: str = "", s_start_datetime: str = "", s_end_datetime: str = "") -> None:
    """Migrate the existing samples to have the filtered positive values.

    The run is aborted early (with an error logged and nothing raised) when either datetime string
    is invalid, when the start is after the end, or when the end is after the date on which the
    filtered positive fields were set. Exceptions raised during the migration itself are logged
    and re-raised; the per-version status summary is logged either way.

    Arguments:
        settings_module {str} -- settings module from which to generate the app config
        s_start_datetime {str} -- start of the date range, expected format YYMMDD_HHmm
        s_end_datetime {str} -- end of the date range, expected format YYMMDD_HHmm
    """
    # NOTE(review): block nesting and the layout of the multi-line status message below were
    # reconstructed from a whitespace-flattened copy of this function -- confirm against the
    # canonical file before merging.
    if not valid_datetime_string(s_start_datetime):
        logger.error(
            "Aborting run: Expected format of Start datetime is YYMMDD_HHmm")
        return

    if not valid_datetime_string(s_end_datetime):
        logger.error(
            "Aborting run: Expected format of End datetime is YYMMDD_HHmm")
        return

    start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT)
    end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT)
    fields_set_datetime = datetime.strptime(FILTERED_POSITIVE_FIELDS_SET_DATE, "%Y-%m-%d")

    if start_datetime > end_datetime:
        logger.error(
            "Aborting run: End datetime must be greater than Start datetime")
        return

    # Only samples dated up to the point the filtered positive fields were set are in scope
    if end_datetime > fields_set_datetime:
        logger.error(
            "Aborting run: Date range must be prior to the 17th December")
        return

    config, settings_module = get_config(settings_module)

    logging.config.dictConfig(config.LOGGING)

    logger.info("-" * 80)
    logger.info("STARTING FILTERED POSITIVES LEGACY UPDATE")
    logger.info(f"Time start: {datetime.now()}")
    start_time = time.time()

    # Per-version bookkeeping reported in the status summary: whether the update succeeded and
    # how long it took, tracked separately for Mongo and MLWH.
    updated_key = "Updated"
    time_key = "Time taken"

    mongo_versions_updated = {
        FILTERED_POSITIVE_VERSION_0: {
            updated_key: False,
            time_key: 0.0
        },
        FILTERED_POSITIVE_VERSION_1: {
            updated_key: False,
            time_key: 0.0
        },
        FILTERED_POSITIVE_VERSION_2: {
            updated_key: False,
            time_key: 0.0
        },
    }

    mlwh_versions_updated = {
        FILTERED_POSITIVE_VERSION_0: {
            updated_key: False,
            time_key: 0.0
        },
        FILTERED_POSITIVE_VERSION_1: {
            updated_key: False,
            time_key: 0.0
        },
        FILTERED_POSITIVE_VERSION_2: {
            updated_key: False,
            time_key: 0.0
        },
    }

    try:
        continue_migration = pre_migration_filtered_positive_check(
            config, start_datetime, end_datetime)

        if continue_migration:
            logger.info(
                f"Selecting legacy samples from Mongo between {start_datetime} and {end_datetime}..."
            )
            samples = mongo_samples_by_date(config, start_datetime,
                                            end_datetime)

            legacy_samples_num = len(samples)
            logger.info(f"{legacy_samples_num} samples found from Mongo")

            root_sample_ids, plate_barcodes = extract_required_cp_info(samples)

            logger.info("Querying for v0 cherrypicked samples from MLWH")
            # Get v0 cherrypicked samples
            v0_cp_samples_df = get_cherrypicked_samples_by_date(
                config,
                list(root_sample_ids),
                list(plate_barcodes),
                "1970-01-01 00:00:01",
                V0_V1_CUTOFF_TIMESTAMP,
            )

            logger.debug(
                f"Found {len(v0_cp_samples_df.index)} v0 cherrypicked samples"
            )  # type: ignore

            logger.info("Querying for cherrypicked samples from MLWH")
            # Get v1 cherrypicked samples
            v1_cp_samples_df = get_cherrypicked_samples_by_date(
                config,
                list(root_sample_ids),
                list(plate_barcodes),
                V0_V1_CUTOFF_TIMESTAMP,
                V1_V2_CUTOFF_TIMESTAMP,
            )

            logger.debug(
                f"Found {len(v1_cp_samples_df.index)} v1 cherrypicked samples"
            )  # type: ignore

            logger.info("Splitting samples by version...")
            samples_by_version = split_mongo_samples_by_version(
                samples, v0_cp_samples_df, v1_cp_samples_df)

            # A single timestamp is shared by every update made during this run
            update_timestamp = datetime.now()

            for version, version_samples in samples_by_version.items():
                filtered_positive_identifier = filtered_positive_identifier_by_version(
                    version)
                logger.info(f"Updating {version} filtered positives...")
                update_filtered_positive_fields(
                    filtered_positive_identifier,
                    version_samples,
                    version,
                    update_timestamp,
                )

            logger.info("Updated filtered positives")

            logger.info("Updating Mongo")

            for version, version_samples in samples_by_version.items():
                logger.info(
                    f"Updating {version} filtered positives in Mongo, total {len(version_samples)} records..."
                )
                mongo_update_start_time = time.time()
                mongo_updated = update_mongo_filtered_positive_fields(
                    config,
                    version_samples,
                    version,
                    update_timestamp,
                )

                # MLWH is only attempted for a version once its Mongo update has succeeded
                if mongo_updated:
                    logger.info(
                        f"Finished updating {version} filtered positives in Mongo"
                    )

                    mongo_update_end_time = time.time()
                    mongo_versions_updated[version][updated_key] = True
                    mongo_versions_updated[version][time_key] = round(
                        mongo_update_end_time - mongo_update_start_time, 2)

                    logger.info(
                        f"Updating {version} filtered positives in MLWH...")
                    mlwh_update_start_time = time.time()

                    mlwh_updated = update_mlwh_filtered_positive_fields_batched(
                        config, version_samples, version, update_timestamp)

                    if mlwh_updated:
                        logger.info(
                            f"Finished updating {version} filtered positives in MLWH"
                        )

                        mlwh_update_end_time = time.time()
                        mlwh_versions_updated[version][updated_key] = True
                        mlwh_versions_updated[version][time_key] = round(
                            mlwh_update_end_time - mlwh_update_start_time, 2)

            logger.info("Finished updating databases")
        else:
            logger.info("Now exiting migration")
    except Exception as e:
        logger.error("---------- Process aborted: ----------")
        logger.error(f"An exception occurred, at {datetime.now()}")
        logger.exception(e)
        raise
    finally:
        end_time = time.time()
        # Status summary is logged whether the migration succeeded, was skipped, or failed.
        # Backslashes at line ends join the value onto the preceding label line of the message.
        logger.info(f"""
        ---------- Processing status of filtered positive field migration: ----------
        -- Mongo updated with v0 filtered positives: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_0][updated_key]}, \
time taken: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_0][time_key]}s
        -- Mongo updated with v1 filtered positives: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_1][updated_key]}, \
time taken: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_1][time_key]}s
        -- Mongo updated with v2 filtered positives: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_2][updated_key]}, \
time taken: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_2][time_key]}s
        -- MLWH updated with v0 filtered positives: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_0][updated_key]}, \
time taken: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_0][time_key]}s
        -- MLWH updated with v1 filtered positives: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_1][updated_key]}, \
time taken: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_1][time_key]}s
        -- MLWH updated with v2 filtered positives: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_2][updated_key]}, \
time taken: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_2][time_key]}s
        """)

    logger.info(f"Time finished: {datetime.now()}")
    logger.info(f"Migration complete in {round(end_time - start_time, 2)}s")
    logger.info("=" * 80)
def centre_prefix_choices():
    """Return the prefix of each centre that get_centres_config yields for the "SFTP" data source."""
    config, _ = get_config("")

    prefixes = []
    for centre in get_centres_config(config, "SFTP"):
        prefixes.append(centre[CENTRE_KEY_PREFIX])

    return prefixes
def run(settings_module: str = "") -> None:
    """Run the sample timestamps helper against the Mongo database.

    Arguments:
        settings_module {str} -- settings module from which to generate the app config
    """
    config, _ = get_config(settings_module)

    with create_mongo_client(config) as mongo_client:
        database = get_mongo_db(config, mongo_client)
        sample_timestamps_helper.add_timestamps_to_samples(database)
def run(sftp: bool, keep_files: bool, add_to_dart: bool, settings_module: str = "") -> None:
    """Ensure the Mongo collections and indexes exist, then process the files of every centre.

    Any exception raised along the way is logged rather than propagated to the caller.

    Arguments:
        sftp {bool} -- when true, each centre's CSV files are downloaded before being processed
        keep_files {bool} -- when false, a centre's downloads are cleaned up after processing,
        provided its download directory is walkable
        add_to_dart {bool} -- passed through to the centre file processing
        settings_module {str} -- settings module from which to generate the app config
    """
    # NOTE(review): block nesting below was reconstructed from a whitespace-flattened copy of this
    # function -- confirm the indentation against the canonical file before merging.
    try:
        start = time.time()
        # get_config also returns the settings module name, which is logged below
        config, settings_module = get_config(settings_module)

        logging.config.dictConfig(config.LOGGING)

        logger.info("-" * 80)
        logger.info("START")
        logger.info(f"Using settings from {settings_module}")

        centres = config.CENTRES

        with create_mongo_client(config) as client:
            db = get_mongo_db(config, client)

            # get or create the centres collection
            centres_collection = get_mongo_collection(db, COLLECTION_CENTRES)

            logger.debug(
                f"Creating index '{FIELD_CENTRE_NAME}' on '{centres_collection.full_name}'"
            )
            centres_collection.create_index(FIELD_CENTRE_NAME, unique=True)
            populate_collection(centres_collection, centres, FIELD_CENTRE_NAME)

            # get or create the source plates collection
            source_plates_collection = get_mongo_collection(
                db, COLLECTION_SOURCE_PLATES)

            logger.debug(
                f"Creating index '{FIELD_BARCODE}' on '{source_plates_collection.full_name}'"
            )
            source_plates_collection.create_index(FIELD_BARCODE, unique=True)

            logger.debug(
                f"Creating index '{FIELD_LH_SOURCE_PLATE_UUID}' on '{source_plates_collection.full_name}'"
            )
            source_plates_collection.create_index(FIELD_LH_SOURCE_PLATE_UUID,
                                                  unique=True)

            with samples_collection_accessor(
                    db, COLLECTION_SAMPLES) as samples_collection:
                # Index on plate barcode to make it easier to select based on plate barcode
                logger.debug(
                    f"Creating index '{FIELD_PLATE_BARCODE}' on '{samples_collection.full_name}'"
                )
                samples_collection.create_index(FIELD_PLATE_BARCODE)

                # Index on result column to make it easier to select the positives
                logger.debug(
                    f"Creating index '{FIELD_RESULT}' on '{samples_collection.full_name}'"
                )
                samples_collection.create_index(FIELD_RESULT)

                # Index on unique combination of columns
                logger.debug(
                    f"Creating compound index on '{samples_collection.full_name}'"
                )
                # create compound index on 'Root Sample ID', 'RNA ID', 'Result', 'Lab ID' - some
                # data had the same plate tested at another time so ignore the data if it is exactly
                # the same
                samples_collection.create_index(
                    [
                        (FIELD_ROOT_SAMPLE_ID, pymongo.ASCENDING),
                        (FIELD_RNA_ID, pymongo.ASCENDING),
                        (FIELD_RESULT, pymongo.ASCENDING),
                        (FIELD_LAB_ID, pymongo.ASCENDING),
                    ],
                    unique=True,
                )

                # Index on lh_source_plate_uuid column
                # Added to make lighthouse API source completion event call query more efficient
                logger.debug(
                    f"Creating index '{FIELD_LH_SOURCE_PLATE_UUID}' on '{samples_collection.full_name}'"
                )
                samples_collection.create_index(FIELD_LH_SOURCE_PLATE_UUID)

                # NOTE(review): the centre loop is reconstructed as living inside the samples
                # collection accessor; confirm whether it belongs one level out.
                centres_instances = [
                    Centre(config, centre_config) for centre_config in centres
                ]
                for centre_instance in centres_instances:
                    logger.info("*" * 80)
                    logger.info(
                        f"Processing {centre_instance.centre_config['name']}")

                    try:
                        if sftp:
                            centre_instance.download_csv_files()

                        centre_instance.process_files(add_to_dart)
                    except Exception as e:
                        # A failure in one centre is logged and does not stop the remaining centres
                        logger.error("An exception occured")
                        logger.error(
                            f"Error in centre {centre_instance.centre_config['name']}"
                        )
                        logger.exception(e)
                    finally:
                        # Clean up runs whether or not processing succeeded
                        if not keep_files and centre_instance.is_download_dir_walkable:
                            centre_instance.clean_up()

        logger.info(f"Import complete in {round(time.time() - start, 2)}s")
        logger.info("=" * 80)
    except Exception as e:
        # Top-level boundary: log and swallow so the caller never sees the exception
        logger.exception(e)
import logging
import logging.config
import sys

from crawler.helpers.general_helpers import get_config
from migrations import (
    sample_timestamps,
    update_dart,
    update_filtered_positives,
    update_legacy_filtered_positives,
    update_mlwh_with_legacy_samples,
)

# Load the config using an empty settings module name.
config, settings_module = get_config("")

logger = logging.getLogger(__name__)

# Override the loaded logging settings for migration runs: the "crawler" logger is switched to
# DEBUG and sent to the "colored_stream" handler, and the "colored" formatter gets a wider layout.
# These assignments mutate config.LOGGING in place before it is applied below.
config.LOGGING["loggers"]["crawler"]["level"] = "DEBUG"
config.LOGGING["loggers"]["crawler"]["handlers"] = ["colored_stream"]
config.LOGGING["formatters"]["colored"][
    "format"] = "%(asctime)-15s %(name)-60s:%(lineno)-3s %(log_color)s%(levelname)-7s %(message)s"

logging.config.dictConfig(config.LOGGING)

##
# Examples of how to run from command line:
# python run_migration.py sample_timestamps
# python run_migration.py update_mlwh_with_legacy_samples 200115_1200 200216_0900
# python run_migration.py update_mlwh_and_dart_with_legacy_samples 200115_1200 200216_0900
# python run_migration.py update_filtered_positives
##
from crawler.helpers.db_helpers import ensure_mongo_collections_indexed
from crawler.helpers.general_helpers import get_config, get_sftp_connection
from tests.testing_objects import (
    EVENT_WH_DATA,
    FILTERED_POSITIVE_TESTING_SAMPLES,
    MLWH_SAMPLE_LIGHTHOUSE_SAMPLE,
    MLWH_SAMPLE_STOCK_RESOURCE,
    MLWH_SAMPLES_WITH_FILTERED_POSITIVE_FIELDS,
    MONGO_SAMPLES_WITH_FILTERED_POSITIVE_FIELDS,
    MONGO_SAMPLES_WITHOUT_FILTERED_POSITIVE_FIELDS,
    TESTING_PRIORITY_SAMPLES,
    TESTING_SAMPLES,
)

logger = logging.getLogger(__name__)

# Load the test settings once at import time and apply their logging configuration.
CONFIG, _ = get_config("crawler.config.test")
logging.config.dictConfig(CONFIG.LOGGING)


@pytest.fixture
def app():
    """Yield the application created by create_app from the "crawler.config.test" settings."""
    app = create_app("crawler.config.test")
    yield app


@pytest.fixture
def client(app):
    """Return a test client obtained from the `app` fixture."""
    return app.test_client()


@pytest.fixture