# Example 1
def test_get_cherrypicked_samples_repeat_tests_sentinel_and_beckman(config, mlwh_cherrypicked_samples, event_wh_data):
    """
    Scenario: real database fixtures, chunked querying, both workflows.

    Matches may appear in more than one chunk; the combined result must
    contain each match exactly once.
    """

    # Inputs mirror MLWH_SAMPLE_STOCK_RESOURCE and MLWH_SAMPLE_LIGHTHOUSE_SAMPLE test data
    root_sample_ids = ["root_1", "root_2", "root_3", "root_4", "root_5", "root_6", "root_1"]
    plate_barcodes = ["pb_1", "pb_3", "pb_4", "pb_5", "pb_6"]

    # Expected matches:
    # - root_1: two samples, but only the one on pb_1 has a Sentinel event
    # - root_2: has a matching Sentinel event, but plate pb_2 is absent from the query
    # - root_3: single sample with a matching Sentinel event (on pb_3)
    # - root_4: two samples, neither with a Sentinel or Beckman event
    # - root_5: two samples, but only the one on pb_4 has a Beckman event
    # - root_6: single sample with a matching Beckman event (on pb_5)
    columns = [FIELD_ROOT_SAMPLE_ID, FIELD_PLATE_BARCODE, "Result_lower", FIELD_COORDINATE]
    rows = [
        ["root_1", "pb_1", "positive", "A1"],
        ["root_3", "pb_3", "positive", "A1"],
        ["root_5", "pb_4", "positive", "A1"],
        ["root_6", "pb_5", "positive", "A1"],
    ]
    expected = pd.DataFrame(np.array(rows), columns=columns, index=[0, 1, 2, 3])

    # a chunk size of 2 forces several queries, exercising de-duplication across chunks
    returned_samples = get_cherrypicked_samples(config, root_sample_ids, plate_barcodes, 2)
    pd.testing.assert_frame_equal(expected, returned_samples)
# Example 2
def test_get_cherrypicked_samples_sentinel_and_beckman(config):
    """
    Scenario: mocked database responses, no chunking.

    Both Sentinel and Beckman queries return matches in a single combined
    response; duplicates across the two workflows must be filtered out.
    """

    mocked_responses = [
        # Cherrypicking query response: first two rows Sentinel, rest Beckman
        pd.DataFrame(
            ["MCM001", "MCM006", "MCM001", "MCM003", "MCM005"],
            columns=[FIELD_ROOT_SAMPLE_ID],
            index=[0, 1, 2, 3, 4],
        ),
    ]
    samples = ["MCM001", "MCM002", "MCM003", "MCM004", "MCM005", "MCM006"]
    plate_barcodes = ["123", "456"]

    with patch("sqlalchemy.create_engine", return_value=Mock()), patch(
        "pandas.read_sql", side_effect=mocked_responses
    ):
        returned_samples = get_cherrypicked_samples(config, samples, plate_barcodes)
        assert returned_samples is not None
        # the duplicate MCM001 is collapsed to a single row
        for row, expected_id in enumerate(["MCM001", "MCM006", "MCM003", "MCM005"]):
            assert returned_samples.at[row, FIELD_ROOT_SAMPLE_ID] == expected_id
# Example 3
def test_get_cherrypicked_samples_chunking_no_sentinel(config):
    """
    Scenario: mocked database responses, chunked querying, Beckman only.

    Only the Beckman queries return matches (no Sentinel), with no
    duplication; all matches are contained in the sum of the chunked queries.
    """

    # One cherrypicking query response per chunk. Each response is indexed
    # from 0 by the query — do not change the indices here unless the query
    # behaviour itself has been modified.
    per_chunk_responses = [
        pd.DataFrame([root_id], columns=[FIELD_ROOT_SAMPLE_ID], index=[0])
        for root_id in ("MCM001", "MCM003", "MCM005")
    ]
    expected = pd.DataFrame(["MCM001", "MCM003", "MCM005"], columns=[FIELD_ROOT_SAMPLE_ID], index=[0, 1, 2])

    samples = ["MCM001", "MCM002", "MCM003", "MCM004", "MCM005"]
    plate_barcodes = ["123", "456"]

    with patch("sqlalchemy.create_engine", return_value=Mock()), patch(
        "pandas.read_sql", side_effect=per_chunk_responses
    ):
        returned_samples = get_cherrypicked_samples(config, samples, plate_barcodes, 2)
        pd.testing.assert_frame_equal(expected, returned_samples)
# Example 4
def test_get_cherrypicked_samples_repeat_tests_no_beckman(config, mlwh_sentinel_cherrypicked, event_wh_data):
    """
    Scenario: real database fixtures, chunked querying, Sentinel only.

    root_1 appears in two different chunks; the result must not contain a
    duplicate row for it.
    """

    # Inputs mirror MLWH_SAMPLE_STOCK_RESOURCE test data
    root_sample_ids = ["root_1", "root_2", "root_3", "root_1"]
    plate_barcodes = ["pb_1", "pb_2", "pb_3"]

    # root_1 matches two samples, but only one has a matching event (matched
    # on Sanger Sample Id), so exactly one 'root_1' row (plate pb_1) comes
    # back — even though root_1 / pb_1 crops up in two different chunks.
    columns = [FIELD_ROOT_SAMPLE_ID, FIELD_PLATE_BARCODE, "Result_lower", FIELD_COORDINATE]
    rows = [
        ["root_1", "pb_1", "positive", "A1"],
        ["root_2", "pb_2", "positive", "A1"],
        ["root_3", "pb_3", "positive", "A1"],
    ]
    expected = pd.DataFrame(np.array(rows), columns=columns, index=[0, 1, 2])

    # a chunk size of 2 forces several queries, exercising de-duplication across chunks
    returned_samples = get_cherrypicked_samples(config, root_sample_ids, plate_barcodes, 2)
    pd.testing.assert_frame_equal(expected, returned_samples)
# Example 5
def test_get_cherrypicked_samples_chunking_sentinel_and_beckman(config):
    """
    Scenario: mocked database responses, chunked querying, both workflows.

    Each chunk's response mixes Sentinel and Beckman matches; duplicates
    across the two workflows must be filtered out of the combined result.
    """

    # One combined (Sentinel, Beckman) response per chunk, each indexed from 0
    # by the query. Do not change the indices here unless the query behaviour
    # itself has been modified.
    chunk_rows = [
        ["MCM001", "MCM001", "MCM002"],  # Sentinel, Beckman, Beckman
        ["MCM003", "MCM003", "MCM004"],  # Sentinel, Beckman, Beckman
        ["MCM005", "MCM005", "MCM006"],  # Sentinel, Beckman, Beckman
    ]
    query_results = [
        pd.DataFrame(rows, columns=[FIELD_ROOT_SAMPLE_ID], index=[0, 1, 2]) for rows in chunk_rows
    ]
    expected = pd.DataFrame(
        ["MCM001", "MCM002", "MCM003", "MCM004", "MCM005", "MCM006"],
        columns=[FIELD_ROOT_SAMPLE_ID],
        index=[0, 1, 2, 3, 4, 5],
    )

    samples = ["MCM001", "MCM002", "MCM003", "MCM004", "MCM005", "MCM006"]
    plate_barcodes = ["123", "456"]

    with patch("sqlalchemy.create_engine", return_value=Mock()), patch(
        "pandas.read_sql", side_effect=query_results
    ):
        returned_samples = get_cherrypicked_samples(config, samples, plate_barcodes, 2)
        pd.testing.assert_frame_equal(expected, returned_samples)
# Example 6
def test_get_cherrypicked_samples_no_beckman(config):
    """
    Scenario: mocked database responses, no chunking, Sentinel only.

    A single query returns all matches, with no duplication.
    """
    mocked_responses = [
        # Cherrypicking query response
        pd.DataFrame(["MCM001", "MCM003", "MCM005"], columns=[FIELD_ROOT_SAMPLE_ID], index=[0, 1, 2]),
    ]
    samples = ["MCM001", "MCM002", "MCM003", "MCM004", "MCM005"]
    plate_barcodes = ["123", "456"]

    with patch("sqlalchemy.create_engine", return_value=Mock()), patch(
        "pandas.read_sql", side_effect=mocked_responses
    ):
        returned_samples = get_cherrypicked_samples(config, samples, plate_barcodes)
        for row, expected_id in enumerate(["MCM001", "MCM003", "MCM005"]):
            assert returned_samples.at[row, FIELD_ROOT_SAMPLE_ID] == expected_id  # type: ignore
# Example 7
def migrate_all_dbs(config: Config,
                    s_start_datetime: str = "",
                    s_end_datetime: str = "") -> None:
    if not config:
        logger.error("Aborting run: Config required")
        return

    if not valid_datetime_string(s_start_datetime):
        logger.error(
            "Aborting run: Expected format of Start datetime is YYMMDD_HHmm")
        return

    if not valid_datetime_string(s_end_datetime):
        logger.error(
            "Aborting run: Expected format of End datetime is YYMMDD_HHmm")
        return

    start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT)
    end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT)

    if start_datetime > end_datetime:
        logger.error(
            "Aborting run: End datetime must be greater than Start datetime")
        return

    logger.info(
        f"Starting DART update process with Start datetime {start_datetime} and End datetime {end_datetime}"
    )

    try:
        mongo_docs_for_sql = []

        # open connection to mongo
        with create_mongo_client(config) as client:
            mongo_db = get_mongo_db(config, client)

            samples_collection = get_mongo_collection(mongo_db,
                                                      COLLECTION_SAMPLES)

            # 1. get samples from mongo between these time ranges
            samples = get_samples(samples_collection, start_datetime,
                                  end_datetime)

            if not samples:
                logger.info("No samples in this time range.")
                return

            logger.debug(f"{len(samples)} samples to process")

            root_sample_ids, plate_barcodes = extract_required_cp_info(samples)

            logger.debug(f"{len(plate_barcodes)} unique plate barcodes")

            # 2. of these, find which have been cherry-picked and remove them from the list
            cp_samples_df = get_cherrypicked_samples(config,
                                                     list(root_sample_ids),
                                                     list(plate_barcodes))

            if cp_samples_df is None:  # we need to check if it is None explicitly
                raise Exception(
                    "Unable to determine cherry-picked sample - potentially error connecting to MySQL"
                )

            # get the samples between those dates minus the cherry-picked ones
            if cp_samples_df is not None and not cp_samples_df.empty:
                # we need a list of cherry-picked samples with their respective plate barcodes
                cp_samples = cp_samples_df[[
                    FIELD_ROOT_SAMPLE_ID, FIELD_PLATE_BARCODE
                ]].to_numpy().tolist()

                logger.debug(
                    f"{len(cp_samples)} cherry-picked samples in this timeframe"
                )

                samples = remove_cherrypicked_samples(samples, cp_samples)
            else:
                logger.debug("No cherry-picked samples in this timeframe")

            logger.info(
                f"{len(samples)} samples between these timestamps and not cherry-picked"
            )

            # 3. add the UUID fields if not present
            add_sample_uuid_field(samples)

            # update the samples with source plate UUIDs
            samples_updated_with_source_plate_uuids(mongo_db, samples)

            # 4. update samples in mongo updated in either of the above two steps (would expect the same set of samples
            #       from both steps)
            logger.info("Updating Mongo...")
            _ = update_mongo_fields(mongo_db, samples)
            logger.info("Finished updating Mongo")

        # convert mongo field values into MySQL format
        for sample in samples:
            mongo_docs_for_sql.append(
                map_mongo_sample_to_mysql(sample, copy_date=True))

        mysql_samples = set_is_current_on_mysql_samples(mongo_docs_for_sql)

        if (num_sql_docs := len(mysql_samples)) > 0:
            logger.info(
                f"Updating MLWH database for {num_sql_docs} sample documents")
            # create connection to the MLWH database
            with create_mysql_connection(config, False) as mlwh_conn:
                # 5. update the MLWH (should be an idempotent operation)

                # TODO: Check here would migration dbs be ok?
                run_mysql_executemany_query(mlwh_conn,
                                            SQL_MLWH_MULTIPLE_INSERT,
                                            mysql_samples)

            # 6. add all the plates with non-cherrypicked samples (determined in step 2) to DART, as well as any
            #       positive samples in these plates
            update_dart_fields(config, samples)
        else: