Exemplo n.º 1
0
def test_centroids_subset_selection_works(pyspark_context, sm_config,
                                          ds_config):
    """Check that ``centroids_subset`` returns only the rows of the requested ion.

    Two ions (101: H2O/+H, 102: Au/-H) with two peaks each are assigned
    directly to the generator's data frames; asking for ('H2O', '+H') must
    give back exactly the two rows indexed by 101.
    """
    iso_wrapper = IsocalcWrapper(ds_config['isotope_generation'])
    centr_gen = IonCentroidsGenerator(
        sc=pyspark_context, moldb_name='HMDB', isocalc=iso_wrapper)

    # Ion table: one row per (sf, adduct) pair, indexed by ion id.
    ion_rows = pd.DataFrame({
        'ion_i': [101, 102],
        'sf': ['H2O', 'Au'],
        'adduct': ['+H', '-H'],
    })
    centr_gen.ion_df = ion_rows.set_index('ion_i')

    # Centroid table: the rows of the two ions are interleaved on purpose.
    centroid_rows = pd.DataFrame({
        'ion_i': [101, 102, 101, 102],
        'peak_i': [0, 0, 1, 1],
        'mz': [100., 300., 200., 400.],
        'int': [100., 10., 100., 1.],
    })
    centr_gen.ion_centroids_df = centroid_rows.set_index('ion_i')

    selected = centr_gen.centroids_subset(ions=[('H2O', '+H')])

    expected_columns = {
        'peak_i': [0, 1],
        'mz': [100., 200.],
        'int': [100., 100.],
    }
    assert selected.index.tolist() == [101, 101]
    assert selected.to_dict(orient='list') == expected_columns
Exemplo n.º 2
0
    def _run_annotation_job(self, mol_db):
        """Run a single annotation job for the dataset against ``mol_db``.

        Stores the job metadata, sets up the FDR helper and the ion centroids
        generator, runs ``MSMBasicSearch`` and stores the search results.

        On any exception the job record is updated with ``JobStatus.FAILED``
        and a ``JobFailedError`` chained to the original error is raised;
        otherwise the record is updated with ``JobStatus.FINISHED``.
        """
        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info(
                "Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            iso_gen_config = self._ds.config['isotope_generation']
            target_adducts = iso_gen_config['adducts']
            self._fdr = FDR(
                job_id=self._job_id,
                decoy_sample_size=20,
                target_adducts=target_adducts,
                db=self._db,
            )

            isocalc = IsocalcWrapper(iso_gen_config)
            centroids_gen = IonCentroidsGenerator(
                sc=self._sc, moldb_name=mol_db.name, isocalc=isocalc)

            # Centroids are generated for the default adducts of the dataset
            # polarity together with every decoy adduct.
            polarity = iso_gen_config['charge']['polarity']
            default_adducts = set(self._sm_config['defaults']['adducts'][polarity])
            decoy_adducts = set(DECOY_ADDUCTS)
            all_adducts = list(default_adducts | decoy_adducts)
            centroids_gen.generate_if_not_exist(
                isocalc=isocalc, sfs=mol_db.sfs, adducts=all_adducts)

            target_ions = centroids_gen.ions(target_adducts)
            self._fdr.decoy_adducts_selection(target_ions)

            search_alg = MSMBasicSearch(
                sc=self._sc,
                ds=self._ds,
                ds_reader=self._ds_reader,
                mol_db=mol_db,
                centr_gen=centroids_gen,
                fdr=self._fdr,
                ds_config=self._ds.config,
            )
            ion_metrics_df, ion_iso_images = search_alg.search()

            search_results = SearchResults(
                mol_db.id, self._job_id, search_alg.metrics.keys())
            mask = self._ds_reader.get_2d_sample_area_mask()
            img_store_type = self._ds.get_ion_img_storage_type(self._db)
            search_results.store(
                ion_metrics_df, ion_iso_images, mask,
                self._db, self._img_store, img_store_type)
        except Exception as e:
            # Record the failure on the job row before surfacing the error.
            failed_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            self._db.alter(
                JOB_UPD_STATUS_FINISH,
                params=(JobStatus.FAILED, failed_at, self._job_id))
            msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(
                self._ds.id, mol_db, str(e))
            raise JobFailedError(msg) from e
        else:
            finished_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            self._db.alter(
                JOB_UPD_STATUS_FINISH,
                params=(JobStatus.FINISHED, finished_at, self._job_id))
Exemplo n.º 3
0
def test_isocalc_wrapper_get_iso_peaks_wrong_sf_adduct(isocalc_isodist, ds_config):
    """``isotope_peaks`` must give empty centroids when sf or adduct is None."""

    def fake_get_spectrum(source):
        # Stand-in for MassSpectrum.get_spectrum: fixed data for the two known
        # sources, None for anything else.
        if source == 'centroids':
            centroid_mzs = np.array([100., 200.])
            centroid_ints = np.array([100., 10.])
            return centroid_mzs, centroid_ints
        if source == 'profile':
            profile_mzs = np.array([0, 90., 95., 99., 100., 101., 105., 110.,
                                    190., 195., 199., 200., 201., 205., 210., 100500.])
            profile_ints = np.array([0, 10., 50., 90., 100., 90., 50., 10.,
                                     1., 5., 9., 10., 9., 5., 1., 100500.])
            return (profile_mzs, profile_ints)
        return None

    spectrum_mock = MagicMock(spec=MassSpectrum)
    spectrum_mock.get_spectrum.side_effect = fake_get_spectrum
    isocalc_isodist.return_value = spectrum_mock

    wrapper = IsocalcWrapper(ds_config['isotope_generation'])
    expected_empty = Centroids([], [])
    assert wrapper.isotope_peaks(None, '+H') == expected_empty
    assert wrapper.isotope_peaks('Au', None) == expected_empty
Exemplo n.º 4
0
def test_generate_returns_valid_df(pyspark_context, sm_config, ds_config):
    """Check the shapes and mz ordering of the frames produced by ``generate``."""
    iso_wrapper = IsocalcWrapper(ds_config['isotope_generation'])
    generator = IonCentroidsGenerator(
        sc=pyspark_context,
        moldb_name='HMDB',
        isocalc=iso_wrapper,
    )
    generator._iso_gen_part_n = 1
    generator.generate(
        isocalc=iso_wrapper,
        sfs=['C2H4O8', 'C3H6O7', 'fake_mf'],
        adducts=['+Na'],
    )

    assert generator.ion_centroids_df.shape == (8, 3)
    # Non-negative consecutive differences mean the frame is sorted by mz.
    mz_steps = np.diff(generator.ion_centroids_df.mz.values)
    assert np.all(mz_steps >= 0)

    assert generator.ion_df.shape == (2, 2)
Exemplo n.º 5
0
def test_centroids_subset_ordered_by_mz(pyspark_context, sm_config, ds_config):
    """Check that a centroids subset for two ions has 8 rows sorted by mz."""
    iso_wrapper = IsocalcWrapper(ds_config['isotope_generation'])
    centr_gen = IonCentroidsGenerator(
        sc=pyspark_context,
        moldb_name='HMDB',
        isocalc=iso_wrapper,
    )
    centr_gen._iso_gen_part_n = 1

    formulas = ['C2H4O8', 'C3H6O7', 'C59H112O6', 'C62H108O']
    adducts = ['+Na', '+H', '+K']
    centr_gen.generate(isocalc=iso_wrapper, sfs=formulas, adducts=adducts)

    requested_ions = [('C59H112O6', '+H'), ('C62H108O', '+Na')]
    subset = centr_gen.centroids_subset(requested_ions)

    assert subset.shape == (8, 3)
    # Non-negative consecutive differences mean the frame is sorted by mz.
    mz_steps = np.diff(subset.mz.values)
    assert np.all(mz_steps >= 0)
Exemplo n.º 6
0
    def __init__(self, sc, sm_config, ds_config):
        """Keep the given context and configs and build the helper objects.

        Args:
            sc: stored unchanged as ``self.sc``.
            sm_config: mapping read for ``['fs']['base_path']`` and ``['db']``.
            ds_config: mapping read for ``['isotope_generation']``.
        """
        self.sc = sc
        self.sm_config = sm_config
        self.ds_config = ds_config

        # Temporary directory for theoretical peaks under the base path.
        base_path = sm_config['fs']['base_path']
        self.theor_peaks_tmp_dir = join(base_path, 'tmp_theor_peaks_gen')
        self.db = DB(sm_config['db'])

        iso_gen_config = self.ds_config['isotope_generation']
        self.adducts = iso_gen_config['adducts']

        self.isocalc_wrapper = IsocalcWrapper(iso_gen_config)
Exemplo n.º 7
0
    def index(self, ds):
        """Rebuild the search index entries for every finished job of ``ds``.

        The dataset is first removed from the index. A finished job whose
        molecular database is not in ``ds.mol_dbs`` has its row deleted from
        the ``job`` table; every other finished job is indexed again.
        """
        self.es.delete_ds(ds.id)

        for job_id, mol_db_name in self._finished_job_moldbs(ds.id):
            if mol_db_name in ds.mol_dbs:
                iso_gen_config = ds.config['isotope_generation']
                mol_db = MolecularDB(name=mol_db_name,
                                     iso_gen_config=iso_gen_config)
                isocalc = IsocalcWrapper(iso_gen_config)
                self.es.index_ds(ds_id=ds.id, mol_db=mol_db, isocalc=isocalc)
                continue

            # The job refers to a database the dataset no longer lists.
            self._db.alter('DELETE FROM job WHERE id = %s', params=(job_id, ))
Exemplo n.º 8
0
def test_isocalc_wrapper_get_iso_peaks_wrong_sf_adduct(isocalc_isodist,
                                                       ds_config):
    """A None sum formula or a None adduct must produce empty centroids."""

    def spectrum_for(source):
        # Replacement for MassSpectrum.get_spectrum; sources other than
        # 'centroids' and 'profile' yield None.
        if source == 'profile':
            mzs = np.array([
                0, 90., 95., 99., 100., 101., 105., 110., 190., 195., 199.,
                200., 201., 205., 210., 100500.
            ])
            ints = np.array([
                0, 10., 50., 90., 100., 90., 50., 10., 1., 5., 9., 10.,
                9., 5., 1., 100500.
            ])
            return (mzs, ints)
        if source == 'centroids':
            return np.array([100., 200.]), np.array([100., 10.])
        return None

    mass_spectrum = MagicMock(spec=MassSpectrum)
    mass_spectrum.get_spectrum.side_effect = spectrum_for
    isocalc_isodist.return_value = mass_spectrum

    wrapper = IsocalcWrapper(ds_config['isotope_generation'])
    no_centroids = Centroids([], [])
    assert wrapper.isotope_peaks(None, '+H') == no_centroids
    assert wrapper.isotope_peaks('Au', None) == no_centroids
Exemplo n.º 9
0
def _reindex_datasets(rows, es_exp):
    """Delete and re-index each dataset in ``rows`` through ``es_exp``.

    Args:
        rows: sized iterable of ``(ds_id, ds_name, ds_config)`` triples.
        es_exp: object providing ``delete_ds`` and ``index_ds``.

    A failure while processing one dataset is logged as an error and does
    not stop the remaining datasets from being processed.
    """
    logger.info('Reindexing %s dataset(s)', len(rows))
    for ds_id, ds_name, ds_config in rows:
        try:
            es_exp.delete_ds(ds_id)
            for db_entry in ds_config['databases']:
                db_name = db_entry['name']
                iso_gen_config = ds_config['isotope_generation']
                mol_db = MolecularDB(name=db_name,
                                     iso_gen_config=iso_gen_config)
                isocalc = IsocalcWrapper(iso_gen_config)
                es_exp.index_ds(ds_id, mol_db=mol_db, isocalc=isocalc)
        except Exception as e:
            # Best effort: report the failure and move on to the next dataset.
            failure_msg = 'Failed to reindex(ds_id={}, ds_name={}): {}'.format(
                ds_id, ds_name, e)
            logger.error(failure_msg)
Exemplo n.º 10
0
def test_save_restore_works(pyspark_context, sm_config, ds_config):
    """Check that ion selection still works after a ``save``/``restore`` cycle."""
    iso_wrapper = IsocalcWrapper(ds_config['isotope_generation'])
    centr_gen = IonCentroidsGenerator(
        sc=pyspark_context,
        moldb_name='HMDB',
        isocalc=iso_wrapper,
    )

    centroid_rows = pd.DataFrame({
        'ion_i': [101, 101, 102, 102],
        'peak_i': [0, 1, 0, 1],
        'mz': [100., 200., 300., 400.],
        'int': [100., 10., 100., 1.],
    })
    centr_gen.ion_centroids_df = centroid_rows.set_index('ion_i')

    ion_rows = pd.DataFrame({
        'ion_i': [101, 101, 102, 102],
        'sf': ['H2O', 'H2O', 'Au', 'Au'],
        'adduct': ['+H', '-H', '+H', '-H'],
    })
    centr_gen.ion_df = ion_rows.set_index('ion_i')

    centr_gen.save()
    centr_gen.restore()

    subset = centr_gen.centroids_subset(ions=[('H2O', '-H')])
    found_ion_ids = subset.index.unique().tolist()
    assert found_ion_ids == [101]
Exemplo n.º 11
0
    def update(self, ds, **kwargs):
        """Rebuild the search index entries for every finished job of ``ds``.

        The dataset status is set to ``DatasetStatus.INDEXING`` while the work
        runs and to ``DatasetStatus.FINISHED`` afterwards. A finished job whose
        molecular database is not named in ``ds.config['databases']`` has its
        row deleted from the ``job`` table; every other finished job is
        indexed again. ``kwargs`` is accepted but not used here.
        """
        ds.set_status(self._db, self._es, self._status_queue,
                      DatasetStatus.INDEXING)

        self._es.delete_ds(ds.id)

        configured_names = [entry['name'] for entry in ds.config['databases']]
        for job_id, mol_db_name in self._finished_job_moldbs(ds.id):
            if mol_db_name in configured_names:
                iso_gen_config = ds.config['isotope_generation']
                mol_db = MolecularDB(name=mol_db_name,
                                     iso_gen_config=iso_gen_config)
                isocalc = IsocalcWrapper(iso_gen_config)
                self._es.index_ds(ds_id=ds.id, mol_db=mol_db, isocalc=isocalc)
                continue

            # The job refers to a database the dataset config no longer lists.
            self._db.alter('DELETE FROM job WHERE id = %s', params=(job_id, ))

        ds.set_status(self._db, self._es, self._status_queue,
                      DatasetStatus.FINISHED)
def test_isotopic_pattern_has_n_peaks(ds_config):
    """``ion_centroids`` must return ISOTOPIC_PEAK_N m/z values and intensities."""
    wrapper = IsocalcWrapper(ds_config['isotope_generation'])
    mzs, ints = wrapper.ion_centroids('C8H20NO6P', '+K')

    for values in (mzs, ints):
        assert len(values) == ISOTOPIC_PEAK_N
def test_isotopic_pattern_h20(ds_config):
    """Compare the H2O/+H centroids with reference m/z and intensity values."""
    wrapper = IsocalcWrapper(ds_config['isotope_generation'])
    mzs, ints = wrapper.ion_centroids('H2O', '+H')

    expected_mzs = np.array([19.018,  20.023,  21.023])
    expected_ints = np.array([100.,   0.072,   0.205])
    # m/z values are compared to 3 decimals, intensities to 2.
    assert_array_almost_equal(mzs, expected_mzs, decimal=3)
    assert_array_almost_equal(ints, expected_ints, decimal=2)
def test_isocalc_wrapper_get_iso_peaks_wrong_sf_adduct(ds_config, sf, adduct):
    """``ion_centroids`` must return ``(None, None)`` for the given sf/adduct.

    ``sf`` and ``adduct`` are supplied by the test harness (presumably a
    parametrization that is not visible here -- confirm against the decorator).
    """
    isocalc_wrapper = IsocalcWrapper(ds_config['isotope_generation'])
    mzs, ints = isocalc_wrapper.ion_centroids(sf, adduct)
    # Two separate asserts: in ``assert a, b`` the second expression is only
    # the failure message, so ``ints`` would never be checked.
    assert mzs is None
    assert ints is None