def mysql_sample_generator(config: Config, query: str) -> Iterator[Dict[str, Any]]: with create_mysql_connection(config=config, readonly=True) as connection: with connection.cursor(dictionary=True, buffered=False) as cursor: cursor.execute(query) for row in cursor: yield row
def mlwh_connection(config): mysql_conn = create_mysql_connection(config, readonly=False) try: cursor = mysql_conn.cursor() # clear any existing rows in the lighthouse sample table try: cursor.execute(f"TRUNCATE TABLE {config.MLWH_DB_DBNAME}.{MLWH_TABLE_NAME}") mysql_conn.commit() except Exception: pytest.fail("An exception occurred clearing the table") finally: cursor.close() yield mysql_conn finally: # close the connection mysql_conn.close()
def update_mlwh_filtered_positive_fields(config: Config, samples: List[SampleDoc]) -> bool: """Bulk updates sample filtered positive fields in the MLWH database Arguments: config {Config} -- application config specifying database details samples {List[Dict[str, str]]} -- the list of samples whose filtered positive fields should be updated Returns: bool -- whether the updates completed successfully """ mysql_conn = create_mysql_connection(config, False) if mysql_conn is not None and mysql_conn.is_connected(): mlwh_samples = [map_mongo_to_sql_common(sample) for sample in samples] run_mysql_executemany_query( mysql_conn, SQL_MLWH_MULTIPLE_FILTERED_POSITIVE_UPDATE, mlwh_samples) return True else: return False
def update_mlwh_filtered_positive_fields_batched( config: Config, samples: List[SampleDoc], version: str, update_timestamp: datetime) -> bool: """Bulk updates sample filtered positive fields in the MLWH database Arguments: config {Config} -- application config specifying database details samples {List[Dict[str, str]]} -- the list of samples whose filtered positive fields should be updated version {str} -- filtered positive version update_timestamp {datetime} -- time of filtered positive fields update Returns: bool -- whether the updates completed successfully """ mysql_conn = create_mysql_connection(config, False) completed_successfully = False try: if mysql_conn is not None and mysql_conn.is_connected(): num_samples = len(samples) ROWS_PER_QUERY = 15000 samples_index = 0 logger.debug( f"Attempting to update {num_samples} rows in the MLWH database in batches of {ROWS_PER_QUERY}" ) while samples_index < num_samples: samples_batch = samples[samples_index:( samples_index + ROWS_PER_QUERY)] # noqa: E203 mlwh_samples_batch = [ map_mongo_to_sql_common(sample) for sample in samples_batch ] filtered_positive_ids = [] filtered_negative_ids = [] for sample in mlwh_samples_batch: if sample[MLWH_FILTERED_POSITIVE] is True: filtered_positive_ids.append(sample[MLWH_MONGODB_ID]) else: filtered_negative_ids.append(sample[MLWH_MONGODB_ID]) filtered_positive_num = len(filtered_positive_ids) logger.info( f"Attempting to update {filtered_positive_num} {version} filtered positive samples in MLWH" ) if filtered_positive_num > 0: positive_args: List[Any] = [ True, version, update_timestamp, update_timestamp ] run_mysql_execute_formatted_query( mysql_conn, SQL_MLWH_MULTIPLE_FILTERED_POSITIVE_UPDATE_BATCH, filtered_positive_ids, positive_args, ) filtered_negative_num = len(filtered_negative_ids) logger.info( f"Attempting to update {filtered_negative_num} {version} filtered positive false samples in MLWH" ) if filtered_negative_num > 0: negative_args: List[Any] = [ False, version, update_timestamp, update_timestamp ] run_mysql_execute_formatted_query( mysql_conn, SQL_MLWH_MULTIPLE_FILTERED_POSITIVE_UPDATE_BATCH, filtered_negative_ids, negative_args, ) samples_index += ROWS_PER_QUERY completed_successfully = True return completed_successfully except Exception: logger.error("MLWH filtered positive field batched updates failed") raise finally: # close the connection logger.debug("Closing the MLWH database connection.") mysql_conn.close()
def migrate_all_dbs(config: Config, s_start_datetime: str = "", s_end_datetime: str = "") -> None: if not config: logger.error("Aborting run: Config required") return if not valid_datetime_string(s_start_datetime): logger.error( "Aborting run: Expected format of Start datetime is YYMMDD_HHmm") return if not valid_datetime_string(s_end_datetime): logger.error( "Aborting run: Expected format of End datetime is YYMMDD_HHmm") return start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT) end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT) if start_datetime > end_datetime: logger.error( "Aborting run: End datetime must be greater than Start datetime") return logger.info( f"Starting DART update process with Start datetime {start_datetime} and End datetime {end_datetime}" ) try: mongo_docs_for_sql = [] # open connection to mongo with create_mongo_client(config) as client: mongo_db = get_mongo_db(config, client) samples_collection = get_mongo_collection(mongo_db, COLLECTION_SAMPLES) # 1. get samples from mongo between these time ranges samples = get_samples(samples_collection, start_datetime, end_datetime) if not samples: logger.info("No samples in this time range.") return logger.debug(f"{len(samples)} samples to process") root_sample_ids, plate_barcodes = extract_required_cp_info(samples) logger.debug(f"{len(plate_barcodes)} unique plate barcodes") # 2. of these, find which have been cherry-picked and remove them from the list cp_samples_df = get_cherrypicked_samples(config, list(root_sample_ids), list(plate_barcodes)) if cp_samples_df is None: # we need to check if it is None explicitly raise Exception( "Unable to determine cherry-picked sample - potentially error connecting to MySQL" ) # get the samples between those dates minus the cherry-picked ones if cp_samples_df is not None and not cp_samples_df.empty: # we need a list of cherry-picked samples with their respective plate barcodes cp_samples = cp_samples_df[[ FIELD_ROOT_SAMPLE_ID, FIELD_PLATE_BARCODE ]].to_numpy().tolist() logger.debug( f"{len(cp_samples)} cherry-picked samples in this timeframe" ) samples = remove_cherrypicked_samples(samples, cp_samples) else: logger.debug("No cherry-picked samples in this timeframe") logger.info( f"{len(samples)} samples between these timestamps and not cherry-picked" ) # 3. add the UUID fields if not present add_sample_uuid_field(samples) # update the samples with source plate UUIDs samples_updated_with_source_plate_uuids(mongo_db, samples) # 4. update samples in mongo updated in either of the above two steps (would expect the same set of samples # from both steps) logger.info("Updating Mongo...") _ = update_mongo_fields(mongo_db, samples) logger.info("Finished updating Mongo") # convert mongo field values into MySQL format for sample in samples: mongo_docs_for_sql.append( map_mongo_sample_to_mysql(sample, copy_date=True)) if (num_sql_docs := len(mongo_docs_for_sql)) > 0: logger.info( f"Updating MLWH database for {num_sql_docs} sample documents") # create connection to the MLWH database with create_mysql_connection(config, False) as mlwh_conn: # 5. update the MLWH (should be an idempotent operation) run_mysql_executemany_query(mlwh_conn, SQL_MLWH_MULTIPLE_INSERT, mongo_docs_for_sql) # 6. add all the plates with non-cherrypicked samples (determined in step 2) to DART, as well as any # positive samples in these plates update_dart_fields(config, samples) else:
def test_create_mysql_connection_exception(config): # For example, if the credentials in the config are wrong with patch("mysql.connector.connect", side_effect=mysql.Error()): assert create_mysql_connection(config) is None
def test_create_mysql_connection_none(config): with patch("mysql.connector.connect", return_value=None): assert create_mysql_connection(config) is None
def update_mlwh_with_legacy_samples(config: Config, s_start_datetime: str = "", s_end_datetime: str = "") -> None: if not valid_datetime_string(s_start_datetime): print("Aborting run: Expected format of Start datetime is YYMMDD_HHmm") return if not valid_datetime_string(s_end_datetime): print("Aborting run: Expected format of End datetime is YYMMDD_HHmm") return start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT) end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT) if start_datetime > end_datetime: print("Aborting run: End datetime must be greater than Start datetime") return print( f"Starting MLWH update process with Start datetime {start_datetime} and End datetime {end_datetime}" ) try: mongo_docs_for_sql = [] number_docs_found = 0 # open connection mongo with create_mongo_client(config) as client: mongo_db = get_mongo_db(config, client) samples_collection = get_mongo_collection(mongo_db, COLLECTION_SAMPLES) print("Selecting Mongo samples") # this should take everything from the cursor find into RAM memory (assuming you have # enough memory) mongo_docs = list( samples_collection.find({ FIELD_CREATED_AT: { "$gte": start_datetime, "$lte": end_datetime } })) number_docs_found = len(mongo_docs) print( f"{number_docs_found} documents found in the mongo database between these timestamps" ) # convert mongo field values into MySQL format for doc in mongo_docs: mongo_docs_for_sql.append( map_mongo_sample_to_mysql(doc, copy_date=True)) if number_docs_found > 0: print( f"Updating MLWH database for {len(mongo_docs_for_sql)} sample documents" ) # create connection to the MLWH database with create_mysql_connection(config, False) as mlwh_conn: # execute sql query to insert/update timestamps into MLWH run_mysql_executemany_query(mlwh_conn, SQL_MLWH_MULTIPLE_INSERT, mongo_docs_for_sql) else: print( "No documents found for this timestamp range, nothing to insert or update in MLWH" ) except Exception: print_exception()