def test_get_cherrypicked_samples_repeat_tests_sentinel_and_beckman(
    config, mlwh_sentinel_and_beckman_cherrypicked, event_wh_data
):
    """Repeat tests are filtered against both Sentinel and Beckman events.

    Sample and event rows come from MLWH_SAMPLE_STOCK_RESOURCE and
    MLWH_SAMPLE_LIGHTHOUSE_SAMPLE in the test data.
    """
    # root_1 matches 2 samples, but only one has a Sentinel event (on pb_1)
    # root_2 matches a sample with a Sentinel event, but plate pb_2 is excluded from the query
    # root_3 matches a single sample with a Sentinel event (on pb_3)
    # root_4 matches 2 samples with neither a Sentinel nor a Beckman event
    # root_5 matches 2 samples, but only one has a Beckman event (on pb_4)
    # root_6 matches a single sample with a Beckman event (on pb_5)
    queried_root_ids = ["root_1", "root_2", "root_3", "root_4", "root_5", "root_6", "root_1"]
    queried_barcodes = ["pb_1", "pb_3", "pb_4", "pb_5", "pb_6"]

    rows = [
        ["root_1", "pb_1", "positive", "A1"],
        ["root_3", "pb_3", "positive", "A1"],
        ["root_5", "pb_4", "positive", "A1"],
        ["root_6", "pb_5", "positive", "A1"],
    ]
    cols = [FIELD_ROOT_SAMPLE_ID, FIELD_PLATE_BARCODE, "Result_lower", FIELD_COORDINATE]
    expected_frame = pd.DataFrame(np.array(rows), columns=cols, index=[0, 1, 2, 3])

    # a chunk size of 2 also exercises the chunking code paths
    actual_frame = get_cherrypicked_samples(config, queried_root_ids, queried_barcodes, 2)

    pd.testing.assert_frame_equal(expected_frame, actual_frame)
def test_get_cherrypicked_samples_chunking_sentinel_and_beckman(config):
    """Chunked querying concatenates the (Sentinel, Beckman) responses per chunk."""

    def query_frame(root_ids):
        # Each database query result is indexed from 0. Do not change the
        # indices here unless you have modified the behaviour of the query.
        return pd.DataFrame(root_ids, columns=[FIELD_ROOT_SAMPLE_ID], index=list(range(len(root_ids))))

    # Three (Sentinel, Beckman) pairs of mocked database responses, one pair per chunk.
    mocked_query_results = [
        query_frame(["MCM001"]),            # Sentinel, chunk 1
        query_frame(["MCM001", "MCM002"]),  # Beckman, chunk 1
        query_frame(["MCM003"]),            # Sentinel, chunk 2
        query_frame(["MCM003", "MCM004"]),  # Beckman, chunk 2
        query_frame(["MCM005"]),            # Sentinel, chunk 3
        query_frame(["MCM005", "MCM006"]),  # Beckman, chunk 3
    ]
    expected = query_frame(["MCM001", "MCM002", "MCM003", "MCM004", "MCM005", "MCM006"])

    with patch("sqlalchemy.create_engine", return_value=Mock()):
        with patch("pandas.read_sql", side_effect=mocked_query_results):
            actual = get_cherrypicked_samples(
                config,
                ["MCM001", "MCM002", "MCM003", "MCM004", "MCM005"],
                ["123", "456"],
                2,
            )

    pd.testing.assert_frame_equal(expected, actual)
def test_get_cherrypicked_samples_sentinel_and_beckman(config):
    """Sentinel and Beckman responses are combined, preserving order and de-duplicating."""
    mocked_responses = [
        # Sentinel query response
        pd.DataFrame(["MCM001", "MCM006"], columns=[FIELD_ROOT_SAMPLE_ID], index=[0, 1]),
        # Beckman query response
        pd.DataFrame(["MCM001", "MCM003", "MCM005"], columns=[FIELD_ROOT_SAMPLE_ID], index=[0, 1, 2]),
    ]

    with patch("sqlalchemy.create_engine", return_value=Mock()):
        with patch("pandas.read_sql", side_effect=mocked_responses):
            returned_samples = get_cherrypicked_samples(
                config,
                ["MCM001", "MCM002", "MCM003", "MCM004", "MCM005", "MCM006"],
                ["123", "456"],
            )

    # MCM001 appears in both responses but must come back only once
    for position, root_id in enumerate(["MCM001", "MCM006", "MCM003", "MCM005"]):
        assert returned_samples.at[position, FIELD_ROOT_SAMPLE_ID] == root_id  # type: ignore
def test_get_cherrypicked_samples_repeat_tests_no_beckman(
    config, mlwh_sentinel_cherrypicked, event_wh_data
):
    """Repeat tests are filtered with Sentinel events only (no Beckman data).

    Sample rows come from MLWH_SAMPLE_STOCK_RESOURCE in the test data.
    """
    # root_1 matches 2 samples, but only one has an event (on Sanger Sample Id),
    # so only the 'root_1' on plate 'pb_1' comes back. root_1 is also listed
    # twice and so appears in two different chunks, which checks that no
    # duplicate root_1 / pb_1 row is produced.
    requested_root_ids = ["root_1", "root_2", "root_3", "root_1"]
    requested_barcodes = ["pb_1", "pb_2", "pb_3"]

    data = [
        ["root_1", "pb_1", "positive", "A1"],
        ["root_2", "pb_2", "positive", "A1"],
        ["root_3", "pb_3", "positive", "A1"],
    ]
    columns = [FIELD_ROOT_SAMPLE_ID, FIELD_PLATE_BARCODE, "Result_lower", FIELD_COORDINATE]
    expected = pd.DataFrame(np.array(data), columns=columns, index=[0, 1, 2])

    actual = get_cherrypicked_samples(config, requested_root_ids, requested_barcodes, 2)

    pd.testing.assert_frame_equal(expected, actual)
def remove_cherrypicked_samples(config: Config, samples: List[SampleDoc]) -> List[SampleDoc]:
    """Filters an input list of samples for those that have not been cherrypicked.

    Arguments:
        config {Config} -- application config specifying database details
        samples {List[Sample]} -- the list of samples to filter

    Returns:
        List[Sample] -- non-cherrypicked samples
    """
    root_sample_ids, plate_barcodes = extract_required_cp_info(samples)

    cherrypicked_frame = get_cherrypicked_samples(config, list(root_sample_ids), list(plate_barcodes))

    # None signals a query failure, not "no cherry-picked samples" — fail loudly
    if cherrypicked_frame is None:
        raise Exception(
            "Unable to determine cherry-picked samples - potentially error connecting to MySQL"
        )

    # nothing cherry-picked in range: every sample passes the filter
    if cherrypicked_frame.empty:
        return samples

    cherrypicked_pairs = (
        cherrypicked_frame[[FIELD_ROOT_SAMPLE_ID, FIELD_PLATE_BARCODE]].to_numpy().tolist()
    )
    return remove_cp_samples(samples, cherrypicked_pairs)
def migrate_all_dbs(config: Config, s_start_datetime: str = "", s_end_datetime: str = "") -> None: if not config: logger.error("Aborting run: Config required") return if not valid_datetime_string(s_start_datetime): logger.error( "Aborting run: Expected format of Start datetime is YYMMDD_HHmm") return if not valid_datetime_string(s_end_datetime): logger.error( "Aborting run: Expected format of End datetime is YYMMDD_HHmm") return start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT) end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT) if start_datetime > end_datetime: logger.error( "Aborting run: End datetime must be greater than Start datetime") return logger.info( f"Starting DART update process with Start datetime {start_datetime} and End datetime {end_datetime}" ) try: mongo_docs_for_sql = [] # open connection to mongo with create_mongo_client(config) as client: mongo_db = get_mongo_db(config, client) samples_collection = get_mongo_collection(mongo_db, COLLECTION_SAMPLES) # 1. get samples from mongo between these time ranges samples = get_samples(samples_collection, start_datetime, end_datetime) if not samples: logger.info("No samples in this time range.") return logger.debug(f"{len(samples)} samples to process") root_sample_ids, plate_barcodes = extract_required_cp_info(samples) logger.debug(f"{len(plate_barcodes)} unique plate barcodes") # 2. 
of these, find which have been cherry-picked and remove them from the list cp_samples_df = get_cherrypicked_samples(config, list(root_sample_ids), list(plate_barcodes)) if cp_samples_df is None: # we need to check if it is None explicitly raise Exception( "Unable to determine cherry-picked sample - potentially error connecting to MySQL" ) # get the samples between those dates minus the cherry-picked ones if cp_samples_df is not None and not cp_samples_df.empty: # we need a list of cherry-picked samples with their respective plate barcodes cp_samples = cp_samples_df[[ FIELD_ROOT_SAMPLE_ID, FIELD_PLATE_BARCODE ]].to_numpy().tolist() logger.debug( f"{len(cp_samples)} cherry-picked samples in this timeframe" ) samples = remove_cherrypicked_samples(samples, cp_samples) else: logger.debug("No cherry-picked samples in this timeframe") logger.info( f"{len(samples)} samples between these timestamps and not cherry-picked" ) # 3. add the UUID fields if not present add_sample_uuid_field(samples) # update the samples with source plate UUIDs samples_updated_with_source_plate_uuids(mongo_db, samples) # 4. update samples in mongo updated in either of the above two steps (would expect the same set of samples # from both steps) logger.info("Updating Mongo...") _ = update_mongo_fields(mongo_db, samples) logger.info("Finished updating Mongo") # convert mongo field values into MySQL format for sample in samples: mongo_docs_for_sql.append( map_mongo_sample_to_mysql(sample, copy_date=True)) if (num_sql_docs := len(mongo_docs_for_sql)) > 0: logger.info( f"Updating MLWH database for {num_sql_docs} sample documents") # create connection to the MLWH database with create_mysql_connection(config, False) as mlwh_conn: # 5. update the MLWH (should be an idempotent operation) run_mysql_executemany_query(mlwh_conn, SQL_MLWH_MULTIPLE_INSERT, mongo_docs_for_sql) # 6. 
add all the plates with non-cherrypicked samples (determined in step 2) to DART, as well as any # positive samples in these plates update_dart_fields(config, samples) else: