Example #1
def mysql_sample_generator(config: Config,
                           query: str) -> Iterator[Dict[str, Any]]:
    with create_mysql_connection(config=config, readonly=True) as connection:
        with connection.cursor(dictionary=True, buffered=False) as cursor:
            cursor.execute(query)
            for row in cursor:
                yield row
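Example #1 streams query results row by row from an unbuffered dictionary cursor, so the full result set never has to fit in memory. A minimal consumption sketch, assuming `config` is the application Config; the table and column names below are illustrative, not taken from the example:

# hypothetical query; rows arrive as dicts keyed by column name
for row in mysql_sample_generator(config, "SELECT id, updated_at FROM lighthouse_sample"):
    print(row["id"], row["updated_at"])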
Example #2
def mlwh_connection(config):
    mysql_conn = create_mysql_connection(config, readonly=False)
    try:
        cursor = mysql_conn.cursor()
        # clear any existing rows in the lighthouse sample table
        try:
            cursor.execute(f"TRUNCATE TABLE {config.MLWH_DB_DBNAME}.{MLWH_TABLE_NAME}")
            mysql_conn.commit()
        except Exception:
            pytest.fail("An exception occurred clearing the table")
        finally:
            cursor.close()
        yield mysql_conn
    finally:
        # close the connection
        mysql_conn.close()
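Example #2 is the body of a pytest fixture (the `@pytest.fixture` decorator is not shown): it truncates the lighthouse sample table, yields the open connection to the test, and closes it afterwards. A sketch of a test consuming it, assuming the fixture is registered under the name `mlwh_connection` and that `config` is also available as a fixture:

def test_lighthouse_sample_table_starts_empty(mlwh_connection, config):
    # the fixture truncates the table before yielding, so it should be empty here
    cursor = mlwh_connection.cursor()
    try:
        cursor.execute(f"SELECT COUNT(*) FROM {config.MLWH_DB_DBNAME}.{MLWH_TABLE_NAME}")
        (count,) = cursor.fetchone()
    finally:
        cursor.close()
    assert count == 0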
Example #3
def update_mlwh_filtered_positive_fields(config: Config,
                                         samples: List[SampleDoc]) -> bool:
    """Bulk updates sample filtered positive fields in the MLWH database

    Arguments:
        config {Config} -- application config specifying database details
        samples {List[SampleDoc]} -- the list of samples whose filtered positive fields should be updated

    Returns:
        bool -- whether the updates completed successfully
    """
    mysql_conn = create_mysql_connection(config, False)

    if mysql_conn is not None and mysql_conn.is_connected():
        mlwh_samples = [map_mongo_to_sql_common(sample) for sample in samples]
        run_mysql_executemany_query(
            mysql_conn, SQL_MLWH_MULTIPLE_FILTERED_POSITIVE_UPDATE,
            mlwh_samples)
        return True
    else:
        return False
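A call sketch for the helper above; `samples` is assumed to be a list of Mongo sample documents, and the boolean return value reports whether an MLWH connection could be obtained:

# hypothetical call; `samples` would come from a Mongo query elsewhere
if update_mlwh_filtered_positive_fields(config, samples):
    logger.info("MLWH filtered positive fields updated")
else:
    logger.error("Could not connect to the MLWH database")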
Example #4
def update_mlwh_filtered_positive_fields_batched(
        config: Config, samples: List[SampleDoc], version: str,
        update_timestamp: datetime) -> bool:
    """Bulk updates sample filtered positive fields in the MLWH database

    Arguments:
        config {Config} -- application config specifying database details
        samples {List[SampleDoc]} -- the list of samples whose filtered positive fields
        should be updated
        version {str} -- filtered positive version
        update_timestamp {datetime} -- time of filtered positive fields update

    Returns:
        bool -- whether the updates completed successfully
    """
    mysql_conn = create_mysql_connection(config, False)
    completed_successfully = False
    try:
        if mysql_conn is not None and mysql_conn.is_connected():
            num_samples = len(samples)
            ROWS_PER_QUERY = 15000
            samples_index = 0
            logger.debug(
                f"Attempting to update {num_samples} rows in the MLWH database in batches of {ROWS_PER_QUERY}"
            )
            while samples_index < num_samples:
                samples_batch = samples[samples_index:(
                    samples_index + ROWS_PER_QUERY)]  # noqa: E203
                mlwh_samples_batch = [
                    map_mongo_to_sql_common(sample) for sample in samples_batch
                ]

                filtered_positive_ids = []
                filtered_negative_ids = []
                for sample in mlwh_samples_batch:
                    if sample[MLWH_FILTERED_POSITIVE] is True:
                        filtered_positive_ids.append(sample[MLWH_MONGODB_ID])
                    else:
                        filtered_negative_ids.append(sample[MLWH_MONGODB_ID])

                filtered_positive_num = len(filtered_positive_ids)
                logger.info(
                    f"Attempting to update {filtered_positive_num} {version} filtered positive samples in MLWH"
                )

                if filtered_positive_num > 0:
                    positive_args: List[Any] = [
                        True, version, update_timestamp, update_timestamp
                    ]
                    run_mysql_execute_formatted_query(
                        mysql_conn,
                        SQL_MLWH_MULTIPLE_FILTERED_POSITIVE_UPDATE_BATCH,
                        filtered_positive_ids,
                        positive_args,
                    )

                filtered_negative_num = len(filtered_negative_ids)
                logger.info(
                    f"Attempting to update {filtered_negative_num} {version} filtered positive false samples in MLWH"
                )

                if filtered_negative_num > 0:
                    negative_args: List[Any] = [
                        False, version, update_timestamp, update_timestamp
                    ]
                    run_mysql_execute_formatted_query(
                        mysql_conn,
                        SQL_MLWH_MULTIPLE_FILTERED_POSITIVE_UPDATE_BATCH,
                        filtered_negative_ids,
                        negative_args,
                    )

                samples_index += ROWS_PER_QUERY
            completed_successfully = True
        return completed_successfully
    except Exception:
        logger.error("MLWH filtered positive field batched updates failed")
        raise
    finally:
        # close the connection if one was opened (create_mysql_connection can return None)
        logger.debug("Closing the MLWH database connection.")
        if mysql_conn is not None:
            mysql_conn.close()
Example #5
def migrate_all_dbs(config: Config,
                    s_start_datetime: str = "",
                    s_end_datetime: str = "") -> None:
    if not config:
        logger.error("Aborting run: Config required")
        return

    if not valid_datetime_string(s_start_datetime):
        logger.error(
            "Aborting run: Expected format of Start datetime is YYMMDD_HHmm")
        return

    if not valid_datetime_string(s_end_datetime):
        logger.error(
            "Aborting run: Expected format of End datetime is YYMMDD_HHmm")
        return

    start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT)
    end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT)

    if start_datetime > end_datetime:
        logger.error(
            "Aborting run: End datetime must be greater than Start datetime")
        return

    logger.info(
        f"Starting DART update process with Start datetime {start_datetime} and End datetime {end_datetime}"
    )

    try:
        mongo_docs_for_sql = []

        # open connection to mongo
        with create_mongo_client(config) as client:
            mongo_db = get_mongo_db(config, client)

            samples_collection = get_mongo_collection(mongo_db,
                                                      COLLECTION_SAMPLES)

            # 1. get samples from mongo between these time ranges
            samples = get_samples(samples_collection, start_datetime,
                                  end_datetime)

            if not samples:
                logger.info("No samples in this time range.")
                return

            logger.debug(f"{len(samples)} samples to process")

            root_sample_ids, plate_barcodes = extract_required_cp_info(samples)

            logger.debug(f"{len(plate_barcodes)} unique plate barcodes")

            # 2. of these, find which have been cherry-picked and remove them from the list
            cp_samples_df = get_cherrypicked_samples(config,
                                                     list(root_sample_ids),
                                                     list(plate_barcodes))

            if cp_samples_df is None:  # we need to check for None explicitly
                raise Exception(
                    "Unable to determine cherry-picked samples - potentially an error connecting to MySQL"
                )

            # get the samples between those dates minus the cherry-picked ones
            if not cp_samples_df.empty:  # None was already handled above
                # we need a list of cherry-picked samples with their respective plate barcodes
                cp_samples = cp_samples_df[[
                    FIELD_ROOT_SAMPLE_ID, FIELD_PLATE_BARCODE
                ]].to_numpy().tolist()

                logger.debug(
                    f"{len(cp_samples)} cherry-picked samples in this timeframe"
                )

                samples = remove_cherrypicked_samples(samples, cp_samples)
            else:
                logger.debug("No cherry-picked samples in this timeframe")

            logger.info(
                f"{len(samples)} samples between these timestamps and not cherry-picked"
            )

            # 3. add the UUID fields if not present
            add_sample_uuid_field(samples)

            # update the samples with source plate UUIDs
            samples_updated_with_source_plate_uuids(mongo_db, samples)

            # 4. update samples in mongo updated in either of the above two steps (would expect the same set of samples
            #       from both steps)
            logger.info("Updating Mongo...")
            _ = update_mongo_fields(mongo_db, samples)
            logger.info("Finished updating Mongo")

        # convert mongo field values into MySQL format
        for sample in samples:
            mongo_docs_for_sql.append(
                map_mongo_sample_to_mysql(sample, copy_date=True))

        if (num_sql_docs := len(mongo_docs_for_sql)) > 0:
            logger.info(
                f"Updating MLWH database for {num_sql_docs} sample documents")
            # create connection to the MLWH database
            with create_mysql_connection(config, False) as mlwh_conn:
                # 5. update the MLWH (should be an idempotent operation)
                run_mysql_executemany_query(mlwh_conn,
                                            SQL_MLWH_MULTIPLE_INSERT,
                                            mongo_docs_for_sql)

            # 6. add all the plates with non-cherrypicked samples (determined in step 2) to DART, as well as any
            #       positive samples in these plates
            update_dart_fields(config, samples)
        else:
            logger.info(
                "No documents found for this timestamp range, nothing to insert or update in MLWH"
            )
    except Exception:
        # the source example is truncated here; log and re-raise as a minimal, assumed error path
        logger.error("DART update process failed")
        raise
Example #6
def test_create_mysql_connection_exception(config):
    # For example, if the credentials in the config are wrong
    with patch("mysql.connector.connect", side_effect=mysql.Error()):
        assert create_mysql_connection(config) is None
Example #7
def test_create_mysql_connection_none(config):
    with patch("mysql.connector.connect", return_value=None):
        assert create_mysql_connection(config) is None
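For completeness, a hedged sketch of the happy path, patching `mysql.connector.connect` to return a mock connection object; this mirrors Examples #6 and #7 but is not taken from the original test suite:

from unittest.mock import MagicMock, patch

def test_create_mysql_connection_success(config):
    mock_conn = MagicMock()
    mock_conn.is_connected.return_value = True  # report a live connection
    with patch("mysql.connector.connect", return_value=mock_conn):
        # on success the helper is expected to return a usable connection
        assert create_mysql_connection(config) is not None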
Example #8
def update_mlwh_with_legacy_samples(config: Config,
                                    s_start_datetime: str = "",
                                    s_end_datetime: str = "") -> None:
    if not valid_datetime_string(s_start_datetime):
        print("Aborting run: Expected format of Start datetime is YYMMDD_HHmm")
        return

    if not valid_datetime_string(s_end_datetime):
        print("Aborting run: Expected format of End datetime is YYMMDD_HHmm")
        return

    start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT)
    end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT)

    if start_datetime > end_datetime:
        print("Aborting run: End datetime must be greater than Start datetime")
        return

    print(
        f"Starting MLWH update process with Start datetime {start_datetime} and End datetime {end_datetime}"
    )

    try:
        mongo_docs_for_sql = []
        number_docs_found = 0

        # open connection mongo
        with create_mongo_client(config) as client:
            mongo_db = get_mongo_db(config, client)

            samples_collection = get_mongo_collection(mongo_db,
                                                      COLLECTION_SAMPLES)

            print("Selecting Mongo samples")

            # this pulls everything returned by the find() cursor into memory
            # (assuming there is enough RAM to hold it)
            mongo_docs = list(
                samples_collection.find({
                    FIELD_CREATED_AT: {
                        "$gte": start_datetime,
                        "$lte": end_datetime
                    }
                }))
            number_docs_found = len(mongo_docs)
            print(
                f"{number_docs_found} documents found in the mongo database between these timestamps"
            )

            # convert mongo field values into MySQL format
            for doc in mongo_docs:
                mongo_docs_for_sql.append(
                    map_mongo_sample_to_mysql(doc, copy_date=True))

        if number_docs_found > 0:
            print(
                f"Updating MLWH database for {len(mongo_docs_for_sql)} sample documents"
            )
            # create connection to the MLWH database
            with create_mysql_connection(config, False) as mlwh_conn:

                # execute sql query to insert/update timestamps into MLWH
                run_mysql_executemany_query(mlwh_conn,
                                            SQL_MLWH_MULTIPLE_INSERT,
                                            mongo_docs_for_sql)
        else:
            print(
                "No documents found for this timestamp range, nothing to insert or update in MLWH"
            )

    except Exception:
        print_exception()
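A usage sketch for the migration above; the datetime strings follow the YYMMDD_HHmm format named in the validation messages, and the specific values are illustrative only:

# assumes MONGO_DATETIME_FORMAT corresponds to YYMMDD_HHmm (e.g. "%y%m%d_%H%M")
update_mlwh_with_legacy_samples(config,
                                s_start_datetime="201101_0000",
                                s_end_datetime="201130_2359")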