Exemplo n.º 1
0
def reindex_results(sm_config, ds_id, ds_mask, use_inactive_index,
                    offline_reindex, update_fields):
    """Reindex, or partially update, the search documents of datasets.

    Args:
        sm_config: engine config; its 'elasticsearch' section is read here.
        ds_id: comma-separated dataset ids, or a falsy value.
        ds_mask: dataset name prefix used to select datasets when `ds_id`
            is falsy.
        use_inactive_index: if truthy, the ES config is replaced by the
            result of `get_inactive_index_es_config`.
        offline_reindex: if truthy, `_reindex_all` is called and the dataset
            selection arguments are not used.
        update_fields: comma-separated field names. When truthy the selected
            datasets go through `_partial_update_datasets`, otherwise through
            `_reindex_datasets`.
    """
    assert ds_id or ds_mask or offline_reindex

    IsocalcWrapper.set_centroids_cache_enabled(True)

    if offline_reindex:
        _reindex_all(sm_config)
        return

    es_config = sm_config['elasticsearch']
    if use_inactive_index:
        es_config = get_inactive_index_es_config(es_config)

    db = DB()
    es_exp = ESExporter(db,
                        sm_config={
                            **sm_config, 'elasticsearch': es_config
                        })

    if ds_id:
        ds_ids = ds_id.split(',')
    elif ds_mask:
        # Bind the mask as a query parameter instead of formatting it into
        # the SQL text, so quotes in the mask cannot alter the statement.
        rows = db.select("select id from dataset where name like %s",
                         params=(f'{ds_mask}%', ))
        ds_ids = [row_id for (row_id, ) in rows]
    else:
        # Only reachable when assertions are disabled.
        ds_ids = []

    if update_fields:
        _partial_update_datasets(ds_ids, es_exp, update_fields.split(','))
    else:
        _reindex_datasets(ds_ids, es_exp)
Exemplo n.º 2
0
def test_centroids_h20(ds_config):
    """Check the centroid m/z values and intensities returned for 'H2O+H'."""
    expected_mzs = np.array([19.018, 20.022, 20.024, 21.022])
    expected_ints = np.array([1.00e02, 3.83e-02, 3.48e-02, 2.06e-01])

    wrapper = IsocalcWrapper(ds_config)
    actual_mzs, actual_ints = wrapper.centroids('H2O+H')

    assert_array_almost_equal(actual_mzs, expected_mzs, decimal=3)
    assert_array_almost_equal(actual_ints, expected_ints, decimal=2)
Exemplo n.º 3
0
def test_save_restore_works(spark_context, ds_config, clean_isotope_storage_path):
    """Centroids saved by the generator must be restored unchanged."""
    from pandas.testing import assert_frame_equal

    formulas_df = pd.DataFrame(
        {
            'formula_i': [101, 101, 102, 102],
            'formula': ['H2O', 'H2O', 'Au', 'Au'],
        }
    ).set_index('formula_i')
    centroids_df = pd.DataFrame(
        {
            'formula_i': [101, 101, 102, 102],
            'peak_i': [0, 1, 0, 1],
            'mz': [100.0, 200.0, 300.0, 400.0],
            'int': [100.0, 10.0, 100.0, 1.0],
        }
    ).set_index('formula_i')
    saved = FormulaCentroids(formulas_df=formulas_df, centroids_df=centroids_df)

    isocalc = IsocalcWrapper(ds_config)
    generator = CentroidsGenerator(sc=spark_context, isocalc=isocalc)
    generator._save(saved)
    restored = generator._restore()

    # Row order is not part of the contract being tested, so both sides are
    # sorted by index before comparison.
    assert_frame_equal(saved.formulas_df.sort_index(), restored.formulas_df.sort_index())
    assert_frame_equal(saved.centroids_df().sort_index(), restored.centroids_df().sort_index())
Exemplo n.º 4
0
    def __init__(
        self,
        imzml_cobject: CloudObject,
        ibd_cobject: CloudObject,
        moldbs: List[InputMolDb],
        ds_config: DSConfig,
        executor: Executor = None,
        lithops_config=None,
        cache_key=None,
        use_db_cache=True,
        use_db_mutex=True,
    ):
        """Store the inputs and create the helper objects kept on the instance.

        Args:
            imzml_cobject: stored as `self.imzml_cobject`.
            ibd_cobject: stored as `self.ibd_cobject`.
            moldbs: stored as `self.moldbs`.
            ds_config: stored as `self.ds_config` and used to build
                `self.isocalc_wrapper`.
            executor: executor to use; when falsy a new `Executor` is created
                from the lithops config.
            lithops_config: lithops config; when falsy it is read from
                `SMConfig.get_conf()['lithops']`.
            cache_key: when not None, a `PipelineCacher` is created for it;
                otherwise `self.cacher` is None.
            use_db_cache: stored as `self.use_db_cache`.
            use_db_mutex: stored as `self.use_db_mutex`.
        """
        if not lithops_config:
            lithops_config = SMConfig.get_conf()['lithops']
        self.lithops_config = lithops_config
        self._db = DB()

        self.imzml_cobject, self.ibd_cobject = imzml_cobject, ibd_cobject
        self.moldbs = moldbs
        self.ds_config = ds_config
        self.isocalc_wrapper = IsocalcWrapper(ds_config)

        if not executor:
            executor = Executor(lithops_config)
        self.executor = executor
        self.storage = self.executor.storage

        self.cacher: Optional[PipelineCacher] = (
            None
            if cache_key is None
            else PipelineCacher(self.storage, cache_key, lithops_config)
        )

        self.use_db_cache, self.use_db_mutex = use_db_cache, use_db_mutex
        self.ds_segm_size_mb = 128
Exemplo n.º 5
0
 def _fetch_formula_centroids(self, ion_formula_map_df):
     """Generate or load the centroids of every distinct ion formula.

     Args:
         ion_formula_map_df: object whose `ion_formula` column supplies the
             formulas; duplicates are removed with `np.unique`.

     Returns:
         The result of `CentroidsGenerator.generate_if_not_exist`.
     """
     logger.info('Fetching formula centroids')

     isocalc_wrapper = IsocalcWrapper(self._ds_config)
     generator = CentroidsGenerator(sc=self._spark_context, isocalc=isocalc_wrapper)

     unique_formulas = np.unique(ion_formula_map_df.ion_formula.values).tolist()
     formula_centroids = generator.generate_if_not_exist(formulas=unique_formulas)

     logger.debug(
         f'Formula centroids df size: {formula_centroids.centroids_df().shape}'
     )
     return formula_centroids
Exemplo n.º 6
0
def test_centroids_subset_ordered_by_mz(spark_context, ds_config, clean_isotope_storage_path):
    """Generated centroids must have the expected shape and be sorted by m/z."""
    sum_formulas = ['C2H4O8', 'C3H6O7', 'C59H112O6', 'C62H108O']
    adducts = ['+Na', '+H', '[M]+']

    isocalc_wrapper = IsocalcWrapper(ds_config)
    generator = CentroidsGenerator(sc=spark_context, isocalc=isocalc_wrapper)
    generator._iso_gen_part_n = 1

    ion_formulas = [
        generate_ion_formula(formula, adduct)
        for formula, adduct in product(sum_formulas, adducts)
    ]
    formula_centroids = generator.generate_if_not_exist(ion_formulas)

    # 4 formulas x 3 adducts x 4 rows each (presumably 4 peaks per ion;
    # confirm against the ds_config fixture).
    expected_shape = (4 * 3 * 4, 3)
    assert formula_centroids.centroids_df(True).shape == expected_shape

    # The dataframe is sorted by m/z when no consecutive difference is negative.
    mz_steps = np.diff(formula_centroids.centroids_df().mz.values)
    assert np.all(mz_steps >= 0)
Exemplo n.º 7
0
def test_if_not_exist_returns_valid_df(spark_context, ds_config, clean_isotope_storage_path):
    """Only two of the three requested formulas are expected in the result."""
    requested_formulas = ['C2H4O8Na', 'C3H6O7Na', 'fake_mfNa']

    isocalc_wrapper = IsocalcWrapper(ds_config)
    generator = CentroidsGenerator(sc=spark_context, isocalc=isocalc_wrapper)
    generator._iso_gen_part_n = 1

    result = generator.generate_if_not_exist(formulas=requested_formulas)

    # 2 formulas x 4 rows each, 3 columns.
    assert result.centroids_df(True).shape == (2 * 4, 3)

    # The dataframe is sorted by m/z when no consecutive difference is negative.
    mz_steps = np.diff(result.centroids_df().mz.values)
    assert np.all(mz_steps >= 0)

    assert result.formulas_df.shape == (2, 1)
Exemplo n.º 8
0
def make_mock_spectrum(ds_config):
    """Build a mock spectrum from the centroids of MOCK_FORMULAS and decoy ions.

    Args:
        ds_config: dataset config passed to `IsocalcWrapper`.

    Returns:
        Tuple `(mzs, ints)` of numpy arrays, both ordered by ascending m/z.
    """
    isocalc_wrapper = IsocalcWrapper(ds_config)

    # Insert decoys after the first 2 formulas, so that the calculated FDRs are predictable:
    # first 10 formulas = 0/2 (quantized to 5% FDR)
    # remaining 90 formulas = 3/10 (quantized to 50% FDR)
    decoy_formulas = [
        formula + decoy
        for formula, decoy in product(MOCK_FORMULAS[:3], fdr.DECOY_ADDUCTS)
    ]
    formulas = [*MOCK_FORMULAS[:2], *decoy_formulas, *MOCK_FORMULAS[2:]]
    n_formulas = len(formulas)

    all_mzs, all_ints = [], []
    for rank, formula in enumerate(formulas):
        peak_mzs, peak_ints = isocalc_wrapper.centroids(formula)
        all_mzs.extend(peak_mzs)
        # Reduce MSM based on the order in `formulas` by sabotaging spectral correlation
        decay = (1 - rank / n_formulas) ** np.arange(len(peak_ints))
        all_ints.extend(peak_ints * decay)

    all_mzs = np.array(all_mzs)
    all_ints = np.array(all_ints)
    mz_order = np.argsort(all_mzs)
    return all_mzs[mz_order], all_ints[mz_order]
Exemplo n.º 9
0
def create_process_segment(
    ds_segments: List,
    imzml_reader: ImzMLReader,
    ds_config: DSConfig,
    target_formula_inds: Set[int],
    targeted_database_formula_inds: Set[int],
):
    """Build a function that computes image metrics for one centroids segment.

    Args:
        ds_segments: passed to `choose_ds_segments` to select dataset segments.
        imzml_reader: supplies the image height (`h`) and width (`w`) and is
            passed to `make_compute_image_metrics`.
        ds_config: dataset config; its 'image_generation' and
            'isotope_generation' sections are read.
        target_formula_inds: forwarded to `formula_image_metrics`.
        targeted_database_formula_inds: forwarded to `formula_image_metrics`.

    Returns:
        A function `process_centr_segment(segm_i)` returning the tuple
        `(formula_metrics_df, formula_images)`. When the segment file does
        not exist it returns an empty DataFrame and an empty dict.
    """
    compute_metrics = make_compute_image_metrics(imzml_reader, ds_config)
    isocalc = IsocalcWrapper(ds_config)

    image_gen_config = ds_config['image_generation']
    ppm = image_gen_config['ppm']
    min_px = image_gen_config['min_px']
    n_peaks = ds_config['isotope_generation']['n_peaks']
    compute_unused_metrics = ds_config['image_generation'].get('compute_unused_metrics')
    nrows, ncols = imzml_reader.h, imzml_reader.w

    def process_centr_segment(segm_i):
        centr_segm_path = get_file_path(f'centr_segm_{segm_i:04}.pickle')

        # A missing segment is reported and yields empty results, not an error.
        if not centr_segm_path.exists():
            logger.warning(f'Centroids segment path not found {centr_segm_path}')
            return pd.DataFrame(), {}

        logger.info(f'Reading centroids segment {segm_i} from {centr_segm_path}')
        centr_df = read_centroids_segment(centr_segm_path)
        first_ds_segm_i, last_ds_segm_i = choose_ds_segments(ds_segments, centr_df, ppm)

        logger.info(f'Reading dataset segments {first_ds_segm_i}-{last_ds_segm_i}')
        ds_segm_it = read_ds_segments(first_ds_segm_i, last_ds_segm_i)

        formula_images_it = gen_iso_images(
            ds_segm_it, centr_df=centr_df, nrows=nrows, ncols=ncols, isocalc=isocalc
        )
        formula_metrics_df, formula_images = formula_image_metrics(
            formula_images_it,
            compute_metrics,
            target_formula_inds=target_formula_inds,
            targeted_database_formula_inds=targeted_database_formula_inds,
            n_peaks=n_peaks,
            min_px=min_px,
            compute_unused_metrics=compute_unused_metrics,
        )
        logger.info(f'Segment {segm_i} finished')
        return formula_metrics_df, formula_images

    return process_centr_segment
Exemplo n.º 10
0
def get_moldb_centroids(
    executor: Executor,
    sm_storage: Dict,
    ds_config: DSConfig,
    moldbs: List[InputMolDb],
    debug_validate=False,
    use_cache=True,
    use_db_mutex=True,
):
    """Return the DB data and centroid peak cloud objects, using the cache if it has them.

    Args:
        executor: passed to the cache entry and to the build/calculate steps.
        sm_storage: passed to `CentroidsCacheEntry`.
        ds_config: dataset config used for the cache entry, `build_moldb`
            and `IsocalcWrapper`.
        moldbs: molecular databases to build centroids for.
        debug_validate: if truthy, freshly calculated centroids are passed to
            `validate_centroids`.
        use_cache: if truthy, the cache is loaded; otherwise it is cleared
            and the values are recalculated.
        use_db_mutex: if truthy, the cache entry's lock is held for the
            whole load/build/save sequence.

    Returns:
        Tuple `(db_data_cobjs, peaks_cobjs)`.
    """
    cache_entry = CentroidsCacheEntry(executor, sm_storage, ds_config, moldbs)

    with ExitStack() as exit_stack:
        if use_db_mutex:
            exit_stack.enter_context(cache_entry.lock())

        cached = None
        if use_cache:
            cached = cache_entry.load()
        else:
            cache_entry.clear()

        if not cached:
            formula_cobjs, db_data_cobjs = build_moldb(executor, ds_config, moldbs)
            isocalc = IsocalcWrapper(ds_config)
            peaks_cobjs = calculate_centroids(executor, formula_cobjs, isocalc)
            if debug_validate:
                validate_centroids(executor, peaks_cobjs)

            cache_entry.save(db_data_cobjs, peaks_cobjs)
            logger.info(
                f'Saved {len(db_data_cobjs)} DBs, {len(peaks_cobjs)} peak segms to cache'
            )
        else:
            db_data_cobjs, peaks_cobjs = cached
            logger.info(
                f'Loaded {len(db_data_cobjs)} DBs, {len(peaks_cobjs)} peak segms from cache'
            )

    return db_data_cobjs, peaks_cobjs
Exemplo n.º 11
0
    def index(self, ds: Dataset):
        """Re-index all search results for the dataset.

        The dataset's existing documents are deleted first. Jobs whose
        molecular database is not listed in the dataset config are removed
        from the `job` table; every other job is indexed. Finally the dataset
        status is set to FINISHED.

        Args:
            ds: dataset to index
        """
        self._es.delete_ds(ds.id, delete_dataset=False)

        job_docs = self._db.select_with_fields(
            'SELECT id, moldb_id FROM job WHERE ds_id = %s', params=(ds.id,)
        )
        moldb_ids = ds.config['database_ids']
        # Built once: it depends only on the dataset config, not on the job.
        isocalc = IsocalcWrapper(ds.config)

        for job_doc in job_docs:
            moldb_id = job_doc['moldb_id']
            if moldb_id not in moldb_ids:
                # Stale job: delete it without looking up its database.
                self._db.alter('DELETE FROM job WHERE id = %s', params=(job_doc['id'],))
                continue

            moldb = molecular_db.find_by_id(moldb_id)
            self._es.index_ds(ds_id=ds.id, moldb=moldb, isocalc=isocalc)

        ds.set_status(self._db, self._es, DatasetStatus.FINISHED)
Exemplo n.º 12
0
    def reindex_ds(self, ds_id: str):
        """Delete and index dataset documents for all moldbs defined in the dataset config.

        A failure while indexing one moldb is logged with its traceback and
        does not stop the remaining moldbs. If the dataset is not found, a
        warning is logged and nothing is indexed.

        Args:
            ds_id: dataset id
        """
        self.delete_ds(ds_id)

        ds_doc = DB().select_one_with_fields(
            "SELECT name, config FROM dataset WHERE id = %s", params=(ds_id, ))
        if not ds_doc:
            logger.warning(f'Dataset does not exist(ds_id={ds_id})')
            return

        ds_config = ds_doc['config']
        isocalc = IsocalcWrapper(ds_config)
        for moldb_id in ds_config['database_ids']:
            moldb = molecular_db.find_by_id(moldb_id)
            try:
                self.index_ds(ds_id, moldb=moldb, isocalc=isocalc)
            except Exception as e:
                logger.error(
                    f'Failed to reindex(ds_id={ds_id}, ds_name={ds_doc["name"]}, '
                    f'moldb: {moldb}): {e}',
                    exc_info=True,
                )
Exemplo n.º 13
0
def test_centroids_number(ds_config, formula, adduct):
    """A valid formula/adduct pair must yield 4 m/z values and 4 intensities."""
    ion = formula + adduct
    mzs, ints = IsocalcWrapper(ds_config).centroids(ion)

    assert mzs is not None
    assert ints is not None
    assert len(mzs) == 4
    assert len(ints) == 4
Exemplo n.º 14
0
def test_centroids_wrong_formula_adduct(ds_config, formula, adduct):
    """An invalid formula/adduct pair must yield None for both m/z values and intensities."""
    isocalc_wrapper = IsocalcWrapper(ds_config)
    mzs, ints = isocalc_wrapper.centroids(formula + adduct)
    # `assert a, b` uses `b` as the failure message and never checks it,
    # so each value gets its own assertion.
    assert mzs is None
    assert ints is None
Exemplo n.º 15
0
def test_add_isobar_fields_to_anns(ds_config):
    """Check the 'isobars' field that `add_isobar_fields_to_anns` adds to each annotation."""

    def make_ann(annotation_id, centroid_mzs, msm, ion, ion_formula):
        # Each document gets its own image URL list.
        return {
            'annotation_id': annotation_id,
            'centroid_mzs': centroid_mzs,
            'iso_image_urls': ['img1', 'img2', 'img3', 'img4'],
            'msm': msm,
            'ion': ion,
            'ion_formula': ion_formula,
        }

    def make_isobar(ion, ion_formula, msm, peak_ns):
        return {'ion': ion, 'ion_formula': ion_formula, 'msm': msm, 'peak_ns': peak_ns}

    ann_docs = [
        make_ann('Base annotation', [100, 101, 102, 103], 0.5, 'H1+', 'H1'),
        make_ann(
            "Base's 1st centroid overlaps 1st",
            [100.0002, 101.1, 102.1, 103.1],
            0.6,
            'H2+',
            'H2',
        ),
        make_ann(
            "Base's 1st centroid overlaps 2nd (shouldn't be reported)",
            [98, 100.0002, 101.2, 102.2],
            0.7,
            'H3+',
            'H3',
        ),
        make_ann(
            "Base's 2nd and 3rd centroid overlap 3rd and 4th",
            [96, 97, 101, 102],
            0.8,
            'H4+',
            'H4',
        ),
    ]
    expected_isobars = {
        0: [
            make_isobar('H2+', 'H2', 0.6, [(1, 1)]),
            make_isobar('H4+', 'H4', 0.8, [(2, 3), (3, 4)]),
        ],
        1: [make_isobar('H1+', 'H1', 0.5, [(1, 1)])],
        2: [],
        3: [make_isobar('H1+', 'H1', 0.5, [(3, 2), (4, 3)])],
    }

    isocalc = IsocalcWrapper(ds_config)
    ESExporterIsobars.add_isobar_fields_to_anns(ann_docs, isocalc)

    actual_isobars = {i: doc['isobars'] for i, doc in enumerate(ann_docs)}
    assert actual_isobars == expected_isobars