def test_search(mock_fetch_formulas, spark_context, ds_config):
    """Run an MSM search against a mocked reader and check the output sizes.

    The search is expected to yield one metrics row and one ion image per
    (formula, adduct) pair, and an FDR bundle that passes
    ``sanity_check_fdr_diagnostics``.
    """
    formulas = ['H2O', 'C5H3O']
    adducts = ds_config['isotope_generation']['adducts']
    expected_ion_n = len(formulas) * len(adducts)

    def fetch_formulas_stub(moldb_id):
        # The argument is ignored: every lookup returns the same formula list.
        return formulas

    def process_segments_stub(centr_segm_n, func):
        # Apply func to each segment index and hand the results to Spark.
        return spark_context.parallelize(map(func, range(centr_segm_n)))

    mock_fetch_formulas.side_effect = fetch_formulas_stub

    with TemporaryDirectory() as tmpdir:
        moldbs = [MolecularDB(0, 'tests_db', 'version', targeted=True)]
        msm_search = MSMSearch(
            spark_context,
            make_imzml_reader_mock(),
            moldbs,
            ds_config,
            Path(tmpdir),
            NullProfiler(),
        )
        # Replace the centroid fetching and segment processing with test doubles.
        msm_search._fetch_formula_centroids = make_fetch_formula_centroids_mock()
        msm_search.process_segments = process_segments_stub

        # Only the first result yielded by the search is inspected.
        first_result = next(msm_search.search())
        moldb_ion_metrics_df, moldb_ion_images_rdd, fdr_bundle = first_result

        assert len(moldb_ion_metrics_df) == expected_ion_n
        assert moldb_ion_images_rdd.count() == expected_ion_n
        sanity_check_fdr_diagnostics(fdr_bundle)
def test_imzml_reader_tic_image_from_spectra():
    """Check the TIC image of a mock reader built from coordinates and spectra only."""
    reader = make_imzml_reader_mock(spectra=MOCK_SPECTRA, coordinates=MOCK_COORDINATES)

    # All spectra must have been read before the TIC image is requested,
    # so drain the iterator and discard what it yields.
    spectrum_indices = np.arange(4)
    for _spectrum in reader.iter_spectra(spectrum_indices):
        continue

    tic_image = reader.tic_image()
    assert np.array_equal(tic_image, EXPECTED_TIC, equal_nan=True)
示例#3
0
def test_segment_ds(dump_mock):
    """Check that ``segment_ds`` dumps one correctly bounded chunk per segment.

    Args:
        dump_mock: mock whose recorded calls carry the dumped data frame as the
            first positional argument (presumably injected by a ``patch``
            decorator outside this view -- TODO confirm).
    """
    imzml_reader = make_imzml_reader_mock(
        list(product([0], range(10))),
        (np.linspace(0, 90, num=10), np.ones(10)),
    )
    ds_segments = np.array([[0, 50], [50, 90.0]])

    chunk_sp_n = 1000
    segment_ds(imzml_reader, chunk_sp_n, ds_segments, Path('/tmp/abc'))

    dump_calls = dump_mock.call_args_list
    # Every check below lives inside the loop over recorded calls; without this
    # guard the test would pass vacuously if nothing was ever dumped.
    assert len(dump_calls) > 0, 'segment_ds did not dump any segment'

    # Each recorded call is (args, kwargs); args is (data frame, file object).
    for segm_i, ((sp_chunk_df, _file), _kwargs) in enumerate(dump_calls):
        min_mz, max_mz = ds_segments[segm_i]

        assert sp_chunk_df.shape == (50, 3)
        assert np.all(min_mz <= sp_chunk_df.mz)
        assert np.all(sp_chunk_df.mz <= max_mz)
示例#4
0
def _test_compute_metrics():
    """Build and return the image-metrics callable for a mocked reader.

    Only ``analysis_version`` and ``image_generation`` are populated in the
    config; the remaining fields are set to ``None``.
    """
    image_generation = DSConfigImageGeneration(
        ppm=3.0,
        n_levels=30,
        min_px=1,
        compute_unused_metrics=False,
    )
    ds_config = DSConfig(
        analysis_version=1,
        image_generation=image_generation,
        # Unused fields
        database_ids=None,
        isotope_generation=None,
        fdr=None,
    )
    # Cartesian product of range(2) x range(3): six coordinate pairs.
    coordinate_pairs = list(product(range(2), range(3)))
    imzml_reader = make_imzml_reader_mock(coordinate_pairs)
    return make_compute_image_metrics(imzml_reader, ds_config)
示例#5
0
def test_fetch_chunk_spectra_data():
    """Check the m/z and intensity columns fetched for a two-spectrum chunk."""
    mz_n = 10
    spectrum = (np.linspace(0, 90, num=mz_n), np.ones(mz_n))
    imzml_reader = make_imzml_reader_mock([(1, 1, 1), (2, 1, 1)], spectrum)

    sp_chunk_df = fetch_chunk_spectra_data(sp_ids=[0, 1], imzml_reader=imzml_reader)

    # Two spectra are requested, so each m/z value is expected twice in
    # ascending order; repeating the already-sorted grid gives exactly that.
    exp_mzs = np.repeat(np.linspace(0, 90, num=mz_n), 2)
    exp_ints = np.ones(2 * mz_n)

    assert sp_chunk_df.mz.dtype == 'f'
    assert_array_almost_equal(sp_chunk_df.mz, exp_mzs)
    assert_array_almost_equal(sp_chunk_df.int, exp_ints)
def test_imzml_reader_tic_image_from_metadata():
    """Check the TIC image when per-spectrum TIC metadata is supplied to the mock."""
    tic_metadata = {TIC_ACCESSION: [0, 1, 2, 3]}
    reader = make_imzml_reader_mock(
        coordinates=MOCK_COORDINATES,
        spectra=MOCK_SPECTRA,
        spectrum_metadata_fields=tic_metadata,
    )

    # The expected image intentionally matches the metadata values (0..3)
    # rather than the spectra.
    nan = np.nan
    expected_tic = np.array(
        [
            [0, nan, 1],
            [nan, nan, nan],
            [2, nan, 3],
        ]
    )

    tic_image = reader.tic_image()
    assert np.array_equal(tic_image, expected_tic, equal_nan=True)
示例#7
0
def test_define_ds_segments():
    """Check that evenly spaced sample m/z values produce 8 equal-width segments."""
    imzml_reader = make_imzml_reader_mock(mz_precision='d')

    mz_max = 100
    sample_mzs = np.linspace(0, mz_max, 100)
    # 800 bytes expressed in MB.
    # NOTE(review): the original note read "1600 b total data size / 2 segments",
    # which does not match the 8 segments expected below -- confirm the sizing.
    bytes_per_mb = 2**20
    ds_segm_size_mb = 800 / bytes_per_mb

    ds_segments = define_ds_segments(
        sample_mzs,
        sample_ratio=1,
        imzml_reader=imzml_reader,
        ds_segm_size_mb=ds_segm_size_mb,
    )

    exp_ds_segm_n = 8
    # Evenly spaced boundaries from 0 to mz_max; consecutive pairs form segments.
    exp_bounds = np.arange(exp_ds_segm_n + 1) * mz_max / exp_ds_segm_n
    exp_ds_segments = np.column_stack((exp_bounds[:-1], exp_bounds[1:]))

    assert ds_segments.shape == exp_ds_segments.shape
    assert np.allclose(ds_segments, exp_ds_segments)
def test_ambiguous_modifiers(
    fetch_formulas_mock, formula_image_metrics_mock, spark_context, ds_config
):
    """Check that modifier combinations producing identical strings stay distinct.

    The chem mod, neutral loss and adducts are chosen so that different
    (chem_mod, neutral_loss, adduct) tuples concatenate to the same modifier
    string. The search results must still contain one row per distinct tuple.
    """
    with TemporaryDirectory() as tmpdir:
        ds_data_path = Path(tmpdir)

        ds_config = {
            **ds_config,
            "isotope_generation": {
                **ds_config["isotope_generation"],
                # This set of modifiers is deliberately chosen so that ('','-H2O','+H') and
                # ('-H2O+H','','') produce the same modifier string, to test that no code
                # accidentally relies on "modifier" or "ion" strings being unambiguous
                "chem_mods": ["-H2O+H"],
                "neutral_losses": ["-H2O"],
                "adducts": ["+H", "[M]+"],
            },
        }

        # Formulae selected to create isomers with the above modifiers
        formulas = ['H3O', 'H4O', 'H5O2', 'H6O2']
        fetch_formulas_mock.return_value = formulas

        msm_search = MSMSearch(
            spark_context,
            make_imzml_reader_mock(),
            [MolecularDB(0, 'test_db', 'version', targeted=True)],
            ds_config,
            ds_data_path,
            NullProfiler(),
        )
        # Replace the centroid fetching and segment processing with test doubles.
        msm_search._fetch_formula_centroids = make_fetch_formula_centroids_mock()
        msm_search.process_segments = lambda centr_segm_n, func: spark_context.parallelize(
            map(func, range(centr_segm_n))
        )
        formula_image_metrics_mock.side_effect = make_formula_image_metrics_mock_side_effect()

        moldb_ion_metrics_df, _, fdr_bundle = next(msm_search.search())

        def count_rows(column, value):
            # Number of result rows whose `column` equals `value`.
            return len(moldb_ion_metrics_df[moldb_ion_metrics_df[column] == value])

        # No (formula, chem_mod, neutral_loss, adduct) tuple may appear twice.
        ion_keys = moldb_ion_metrics_df[['formula', 'chem_mod', 'neutral_loss', 'adduct']]
        assert ion_keys.duplicated().sum() == 0

        # There are 4 combinations of modifiers to get H2:
        # (H3O,-H2O+H,,), (H3O,,-H2O,+H), (H4O,,-H2O,), (H5O2,-H2O+H,-H2O,)
        assert count_rows('ion_formula', 'H2') == 4
        # Only 1 combination of modifiers can create H7O2: (H6O2,,,+H)
        assert count_rows('ion_formula', 'H7O2') == 1

        # H5O2 and H6O2 can have all combinations:
        # 2 neutral loss options, 2 chem mods, 2 adducts = 8 possible combinations
        assert count_rows('formula', 'H5O2') == 8
        assert count_rows('formula', 'H6O2') == 8
        # H3O and H4O cannot simultaneously have -H2O and -H2O+H
        assert count_rows('formula', 'H3O') == 6
        assert count_rows('formula', 'H4O') == 6

        assert moldb_ion_metrics_df.formula.isin(formulas).all()

        sanity_check_fdr_diagnostics(fdr_bundle)