# Exemplo n.º 1 (Example 1)
def get_fdr_bundles(
    storage: Storage,
    formula_metrics_df: pd.DataFrame,
    db_data_cobjs: List[CObj[DbFDRData]],
    db_id_to_job_id: Dict[int, int],
) -> Dict[int, FdrDiagnosticBundle]:
    """Build one FDR diagnostic bundle per job.

    Args:
        storage: object store used to fetch the serialized per-database FDR data.
        formula_metrics_df: metrics for all formulas across all databases,
            indexed by formula index.
        db_data_cobjs: cloud-object handles to per-database FDR data.
        db_id_to_job_id: maps each database id to its job id.

    Returns:
        Mapping of job id -> FdrDiagnosticBundle for that database.
    """
    logger.debug(f'Making {len(db_data_cobjs)} FDR bundles')
    bundles_by_job: Dict[int, FdrDiagnosticBundle] = {}
    for db_data in iter_cobjs_with_prefetch(storage, db_data_cobjs):
        formula_map_df = (
            db_data['formula_map_df']
            .drop(columns=['target'])
            .drop_duplicates(ignore_index=True)
        )

        # Restrict the global metrics to this database's formulas via an
        # index-on-index inner join, de-duplicating formula_i first so the
        # merge cannot multiply rows.
        this_db_formula_is = (
            formula_map_df[['formula_i']].drop_duplicates().set_index('formula_i')
        )
        metrics_df = formula_metrics_df.rename_axis(index='formula_i').merge(
            this_db_formula_is,
            left_index=True,
            right_index=True,
        )

        fdr = db_data['fdr']
        bundles_by_job[db_id_to_job_id[db_data['id']]] = FdrDiagnosticBundle(
            decoy_sample_size=fdr.decoy_sample_size,
            decoy_map_df=fdr.td_df,
            formula_map_df=formula_map_df,
            metrics_df=metrics_df,
        )

    return bundles_by_job
# Exemplo n.º 2 (Example 2)
    def debug_get_annotation_data(self, formula, modifier):
        """Debugging tool for finding relevant data about a particular annotation, e.g. for
        investigating MSM or image generation issues.

        Returns a dict of all local variables (via ``locals()``) so that a developer
        in a REPL can inspect every intermediate: the matching db_data entries and
        their indexes, the matched ``formula_i`` values, the centroid peaks, and the
        metrics rows. NOTE: because the return value is ``locals()``, the local
        variable names in this function are effectively part of its interface —
        do not rename them casually.
        """
        # pylint: disable=possibly-unused-variable
        # Find formula_i(s): scan every database's formula map for rows matching
        # the requested (formula, modifier) pair.
        db_data_idxs = []
        db_datas = []
        formula_is = []
        for idx, db_data in enumerate(
                iter_cobjs_with_prefetch(self.storage, self.db_data_cobjs)):
            df = db_data['formula_map_df']
            df = df[(df.formula == formula) & (df.modifier == modifier)]
            if not df.empty:
                db_data_idxs.append(idx)
                db_datas.append(db_data)
                formula_is.extend(df.formula_i.tolist())

        # Find centroids: collect the peaks chunks that contain any of the
        # matched formula_i values.
        peaks_df_idxs = []
        peaks_dfs = []
        peaks = []
        for idx, peaks_df in enumerate(
                iter_cobjs_with_prefetch(self.storage, self.peaks_cobjs)):
            df = peaks_df[peaks_df.index.isin(formula_is)]
            if not df.empty:
                peaks_df_idxs.append(idx)
                peaks_dfs.append(peaks_df)
                peaks.append(df)
        # None (rather than an empty frame) signals "no peaks found at all"
        peaks = pd.concat(peaks) if len(peaks) > 0 else None

        # Find MSM metrics rows for the matched formulas
        metrics = self.formula_metrics_df[self.formula_metrics_df.index.isin(
            formula_is)]

        # Remove loop temporaries so they don't clutter the returned locals()
        del idx, df

        return locals()
# Exemplo n.º 3 (Example 3)
def get_formulas_df(
        storage: Storage, ds_config: DSConfig, moldbs: List[InputMolDb]
) -> Tuple[List[CObj[DbFDRData]], pd.DataFrame]:
    """Compute FDR data and the global formulas index for a set of molecular DBs.

    Args:
        storage: object store used for loading databases and saving results.
        ds_config: dataset configuration passed to the per-database FDR step.
        moldbs: molecular database descriptors (each containing a 'cobj' handle).

    Returns:
        A pair of (saved per-database FDR data cloud objects,
        DataFrame of all unique ion formulas indexed by formula_i with
        'target' and 'targeted' boolean columns).
    """
    # Load databases lazily, prefetching the next while one is processed
    moldb_cobjects = [cast(CObj, moldb['cobj']) for moldb in moldbs]
    dbs_iter = iter_cobjs_with_prefetch(storage, moldb_cobjects)

    # Compute FDR + formula maps per database in parallel processes, collecting
    # the union of all ion formulas along the way.
    db_datas: List[DbFDRData] = []
    all_formulas = set()
    target_formulas = set()
    targeted_db_formulas = set()
    with ProcessPoolExecutor() as executor:
        fdr_results = executor.map(
            _get_db_fdr_and_formulas, repeat(ds_config), dbs_iter
        )
        for moldb, (fdr, formula_map_df) in zip(moldbs, fdr_results):
            db_data = {
                **moldb,  # type: ignore # https://github.com/python/mypy/issues/4122
                'fdr': fdr,
                'formula_map_df': formula_map_df,
            }
            db_datas.append(db_data)
            all_formulas.update(formula_map_df.ion_formula)
            target_formulas.update(
                formula_map_df.ion_formula[formula_map_df.target]
            )
            if moldb.get('targeted'):
                targeted_db_formulas.update(formula_map_df.ion_formula)

    # Sorting makes formula_i assignment deterministic across runs
    formulas_df = (
        pd.DataFrame({'ion_formula': sorted(all_formulas)})
        .rename_axis(index='formula_i')
    )
    formulas_df['target'] = formulas_df.ion_formula.isin(target_formulas)
    formulas_df['targeted'] = formulas_df.ion_formula.isin(targeted_db_formulas)

    # Replace each formula_map_df's ion_formula column with the compact
    # formula_i index, to reduce the size of the saved objects
    formula_to_id = pd.Series(formulas_df.index, formulas_df.ion_formula)
    for db_data in db_datas:
        formula_map_df = db_data['formula_map_df']
        formula_map_df['formula_i'] = formula_to_id[
            formula_map_df.ion_formula
        ].values
        del formula_map_df['ion_formula']

    db_data_cobjs = save_cobjs(storage, db_datas)

    return db_data_cobjs, formulas_df
# Exemplo n.º 4 (Example 4)
    def save_png_chunk(df: pd.DataFrame, *, storage: Storage):
        """Generate PNGs for a chunk of formula images and save them as one CObj.

        Args:
            df: DataFrame with formula_i as index and a 'cobj' column pointing
                at the cloud object holding each formula's image dict.
            storage: object store used for loading image dicts and saving PNGs.

        Returns:
            A cloud-object handle to the saved list of (formula_i, [png|None, ...]).
        """
        pngs = []
        # Group formula_is by the CObj containing their images, so each image
        # dict is downloaded only once.
        # Idiom fix: defaultdict(list) instead of defaultdict(lambda: []).
        groups = defaultdict(list)
        for formula_i, cobj in df.cobj.items():
            groups[cobj].append(formula_i)

        # dicts preserve insertion order, so keys() and values() stay aligned
        image_dict_iter = iter_cobjs_with_prefetch(storage, list(groups.keys()))
        for image_dict, formula_is in zip(image_dict_iter, groups.values()):
            for formula_i in formula_is:
                # Missing isotope images stay None so positions are preserved
                formula_pngs = [
                    png_generator.generate_png(img.toarray())
                    if img is not None else None
                    for img in image_dict[formula_i]
                ]
                pngs.append((formula_i, formula_pngs))
        return save_cobj(storage, pngs)
# Exemplo n.º 5 (Example 5)
    def run(self, save=True, **kwargs):
        """Run the annotation pipeline and optionally save results and images.

        Args:
            save: when True, write per-database CSV results and one PNG per
                isotope image into ``self.out_dir``.
            **kwargs: forwarded to ``self.pipe.run_pipeline``.
        """
        results_dfs, png_cobjs = self.pipe.run_pipeline(**kwargs)
        if save:
            # BUGFIX: create the output directory *before* writing the CSVs.
            # Previously mkdir ran only after to_csv, which failed on a fresh
            # directory. parents=True also covers a missing parent path.
            self.out_dir.mkdir(parents=True, exist_ok=True)

            for moldb_id, results_df in results_dfs.items():
                results_df.to_csv(self.out_dir / f'results_{moldb_id}.csv')

            all_results = pd.concat(list(results_dfs.values()))
            # Annotations can appear in several databases - keep one row each
            all_results = all_results[~all_results.index.duplicated()]
            image_names = (all_results.formula +
                           all_results.chem_mod.fillna('') +
                           all_results.neutral_loss.fillna('') +
                           all_results.adduct)

            for imageset in iter_cobjs_with_prefetch(self.storage, png_cobjs):
                for formula_i, imgs in imageset:
                    # Images are numbered from 1 (first isotope peak)
                    for i, img in enumerate(imgs, 1):
                        if img:
                            out_file = self.out_dir / f'{image_names[formula_i]}_{i}.png'
                            # BUGFIX: write_bytes closes the file handle; the
                            # original open('wb').write(img) leaked it.
                            out_file.write_bytes(img)
# Exemplo n.º 6 (Example 6)
    def run_coloc_job_lithops(self, fexec: Executor, ds: Dataset, reprocess: bool = False):
        """Run colocalization jobs for a dataset on serverless Functions and save results.

        Args:
            fexec: lithops-style executor used to fan out the per-database jobs.
            ds: the dataset to analyze.
            reprocess: when True, pending-task discovery includes already-processed
                databases (semantics determined by ``_iter_pending_coloc_tasks``).
        """
        # Extract required fields to avoid pickling Dataset, because unpickling Dataset tries to
        # import psycopg2 and fails inside Functions
        ds_id = ds.id
        sm_config = self._sm_config

        # NOTE: this closure is serialized and shipped to the Functions runtime;
        # it must only capture picklable values (ds_id, sm_config above).
        def run_coloc_job(moldb_id, image_ids, ion_ids, fdrs, *, storage):
            # Use web_app_url to get the publicly-exposed storage server address, because
            # Functions can't use the private address
            images, h, w = _get_images(ImageStorage(sm_config), ds_id, image_ids)
            cobjs = []
            for job in analyze_colocalization(ds_id, moldb_id, images, ion_ids, fdrs, h, w):
                cobjs.append(save_cobj(storage, job))
            return cobjs

        tasks = list(self._iter_pending_coloc_tasks(ds.id, reprocess))
        # Weight task scheduling by image count, and give each worker 4 GB
        cost_factors = pd.DataFrame({'n_images': [len(task[1]) for task in tasks]})
        job_cobjs = fexec.map_concat(
            run_coloc_job, tasks, cost_factors=cost_factors, runtime_memory=4096
        )

        # Persist each finished colocalization job as it is fetched back
        for job in iter_cobjs_with_prefetch(fexec.storage, job_cobjs):
            self._save_job_to_db(job)