Example #1
    def build_ranking(group_i, ranking_i, database, modifier, adduct, id,
                      storage):
        print("Building ranking...")
        print(f'job_i: {id}')
        print(f'ranking_i: {ranking_i}')
        print(f'database: {database}')
        print(f'modifier: {modifier}')
        print(f'adduct: {adduct}')
        # For every unmodified formula in `database`, look up the MSM score for the molecule
        # that it would become after the modifier and adduct are applied
        mols = read_cloud_object_with_retry(storage,
                                            mol_db_path_to_cobj[database],
                                            deserialise)
        if adduct is not None:
            # Target rankings use the same adduct for all molecules
            mol_formulas = list(
                map(safe_generate_ion_formula, mols, repeat(modifier),
                    repeat(adduct)))
        else:
            # Decoy rankings use a consistent random adduct for each molecule, chosen so that it doesn't overlap
            # with other decoy rankings for this molecule
            adducts = _get_random_adduct_set(len(mols), decoy_adducts,
                                             ranking_i)
            mol_formulas = list(
                map(safe_generate_ion_formula, mols, repeat(modifier),
                    adducts))

        formula_to_id = {}
        for cobject in formula_to_id_cobjects:
            formula_to_id_chunk = read_cloud_object_with_retry(
                storage, cobject, deserialise)

            for formula in mol_formulas:
                formula_i = formula_to_id_chunk.get(formula)
                if formula_i is not None:
                    formula_to_id[formula] = formula_i

        formula_is = [
            formula and formula_to_id.get(formula) for formula in mol_formulas
        ]
        msm = [
            formula_i and msm_lookup.get(formula_i) for formula_i in formula_is
        ]
        if adduct is not None:
            ranking_df = pd.DataFrame({'mol': mols, 'msm': msm},
                                      index=formula_is)
            ranking_df = ranking_df[~ranking_df.msm.isna()]
        else:
            # Specific molecules don't matter in the decoy rankings, only their msm distribution
            ranking_df = pd.DataFrame({'msm': msm})
            ranking_df = ranking_df[~ranking_df.msm.isna()]

        return id, storage.put_cloudobject(serialise(ranking_df))
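All of these snippets lean on the same helper, read_cloud_object_with_retry, whose definition is not shown on this page. A minimal sketch of what such a helper might look like, assuming a Lithops-style storage.get_cloudobject API; the retry count and backoff are invented for illustration:

    import io
    import time

    def read_cloud_object_with_retry(storage, cobject, deserialiser=None,
                                     max_retries=3):
        # Hypothetical sketch, not the pipeline's actual implementation.
        for attempt in range(max_retries):
            try:
                data = storage.get_cloudobject(cobject)  # raw bytes
                if deserialiser is None:
                    return data  # callers such as pickle.loads() take bytes directly
                return deserialiser(io.BytesIO(data))  # msgpack.load etc. read a stream
            except Exception:
                if attempt == max_retries - 1:
                    raise
                time.sleep(2 ** attempt)  # back off: 1s, 2s, 4s, ...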
Example #2
 def get_first_peak_mz(cobject, id, storage):
     print(
         f'Extracting first peak mz values from clipped centroids dataframe {id}'
     )
     centr_df = read_cloud_object_with_retry(storage, cobject, deserialise)
     first_peak_df = centr_df[centr_df.peak_i == 0]
     return first_peak_df.mz.values
Example #3
    def upload_chunk(ch_i, storage):
        chunk_sp_inds = chunks[ch_i]
        # Get imzml_reader from COS because it's too big to include via pywren captured vars
        imzml_reader = pickle.loads(
            read_cloud_object_with_retry(storage, imzml_cobject))
        n_spectra = sum(imzml_reader.mzLengths[sp_i] for sp_i in chunk_sp_inds)
        sp_mz_int_buf = np.zeros((n_spectra, 3),
                                 dtype=imzml_reader.mzPrecision)

        chunk_start = 0
        for sp_i, mzs, ints in get_spectra(ibd_path, imzml_reader,
                                           chunk_sp_inds):
            chunk_end = chunk_start + len(mzs)
            sp_mz_int_buf[chunk_start:chunk_end, 0] = sp_id_to_idx[sp_i]
            sp_mz_int_buf[chunk_start:chunk_end, 1] = mzs
            sp_mz_int_buf[chunk_start:chunk_end, 2] = ints
            chunk_start = chunk_end

        by_mz = np.argsort(sp_mz_int_buf[:, 1])
        sp_mz_int_buf = sp_mz_int_buf[by_mz]
        del by_mz

        chunk = msgpack.dumps(sp_mz_int_buf)
        size = len(chunk) / 1024 ** 2  # len() of the bytes payload, in MB
        logger.info(f'Uploading spectra chunk {ch_i} - {size:.2f} MB')
        chunk_cobject = storage.put_cobject(chunk)
        logger.info(f'Spectra chunk {ch_i} finished')
        return chunk_cobject
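One caveat in upload_chunk: stock msgpack cannot pack NumPy arrays, so msgpack.dumps(sp_mz_int_buf) presumably relies on the msgpack-numpy extension having been patched in beforehand. A minimal demonstration of that assumption:

    import msgpack
    import msgpack_numpy
    import numpy as np

    msgpack_numpy.patch()  # teach msgpack to (de)serialise ndarrays transparently

    buf = np.zeros((4, 3), dtype='f')
    restored = msgpack.loads(msgpack.dumps(buf))
    assert np.array_equal(buf, restored)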
Example #4
 def read_ds_segment(cobject):
     data = read_cloud_object_with_retry(storage, cobject, msgpack.load)
     if isinstance(data, list):
         sp_arr = np.concatenate(data)
     else:
         sp_arr = data
     return sp_arr
Example #5
 def get_target_images(images_cobject, storage):
     images = {}
     segm_images = pickle.loads(
         read_cloud_object_with_retry(storage, images_cobject))
     for k, v in segm_images.items():
         if k in targets:
             images[k] = v
     return images
Example #6
    def deduplicate_formulas_chunk(chunk_i, chunk_cobjects, storage):
        print(f'Deduplicating formulas chunk {chunk_i}')
        chunk = set()
        for cobject in chunk_cobjects:
            formulas_chunk_part = read_cloud_object_with_retry(
                storage, cobject, deserialise)
            chunk.update(formulas_chunk_part)

        return chunk
Example #7
 def run_ranking(target_cobject, decoy_cobject, storage):
     target = read_cloud_object_with_retry(storage, target_cobject,
                                           deserialise)
     decoy = read_cloud_object_with_retry(storage, decoy_cobject,
                                          deserialise)
     merged = pd.concat(
         [target.assign(is_target=1),
          decoy.assign(is_target=0)],
         sort=False)
     merged = merged.sort_values('msm', ascending=False)
     decoy_cumsum = (merged.is_target == 0).cumsum()
     target_cumsum = merged.is_target.cumsum()
     base_fdr = np.clip(decoy_cumsum / target_cumsum, 0, 1)
     base_fdr[np.isnan(base_fdr)] = 1
     target_fdrs = merged.assign(fdr=base_fdr)[lambda df: df.is_target == 1]
     target_fdrs = target_fdrs.drop('is_target', axis=1)
     target_fdrs = target_fdrs.sort_values('msm')
     target_fdrs = target_fdrs.assign(
         fdr=np.minimum.accumulate(target_fdrs.fdr))
     target_fdrs = target_fdrs.sort_index()
     return target_fdrs
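run_ranking implements a standard target-decoy FDR estimate: after sorting all hits by MSM score in descending order, the FDR at each target is the running decoy count divided by the running target count, clipped to [0, 1] and then made monotonic. A toy walkthrough with invented scores:

    import numpy as np
    import pandas as pd

    target = pd.DataFrame({'msm': [0.95, 0.80, 0.40]})
    decoy = pd.DataFrame({'msm': [0.85, 0.30]})

    merged = pd.concat([target.assign(is_target=1), decoy.assign(is_target=0)],
                       sort=False).sort_values('msm', ascending=False)
    base_fdr = np.clip((merged.is_target == 0).cumsum() / merged.is_target.cumsum(),
                       0, 1)
    # the three target rows get 0/1, 1/2, 1/3 -> FDRs of 0.00, 0.50, 0.33
    print(merged.assign(fdr=base_fdr))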
Example #8
    def get_target_images(images_cobject, storage):
        images = {}
        segm_images = read_cloud_object_with_retry(storage, images_cobject,
                                                   deserialise)

        for k, imgs in segm_images.items():
            if k in targets:
                if only_first_isotope:
                    imgs = imgs[:1]
                if as_png:
                    imgs = [
                        to_png(img, mask) if img is not None else None
                        for img in imgs
                    ]
                images[k] = imgs
        return images
Example #9
    def segment_centr_chunk(cobject, id, storage):
        print(f'Segmenting clipped centroids dataframe chunk {id}')
        centr_df = read_cloud_object_with_retry(storage, cobject, deserialise)
        centr_segm_df = segment_centr_df(centr_df,
                                         first_level_centr_segm_bounds)

        def _first_level_upload(args):
            segm_i, df = args
            del df['segm_i']
            return segm_i, storage.put_cloudobject(serialise(df))

        with ThreadPoolExecutor(max_workers=128) as pool:
            sub_segms = [(segm_i, df)
                         for segm_i, df in centr_segm_df.groupby('segm_i')]
            sub_segms_cobjects = list(pool.map(_first_level_upload, sub_segms))

        return dict(sub_segms_cobjects)
Example #10
def read_ds_segment(cobject, hybrid_impl, storage):
    data = read_cloud_object_with_retry(storage, cobject, deserialise)

    if isinstance(data, list):
        if isinstance(data[0], np.ndarray):
            data = np.concatenate(data)
        else:
            data = pd.concat(data, ignore_index=True, sort=False)

    if isinstance(data, np.ndarray):
        data = pd.DataFrame({
            'mz': data[:, 1],
            'int': data[:, 2],
            'sp_i': data[:, 0],
        })

    return data
Example #11
    def run_fdr(db_data_cobject):
        db, fdr, formula_map_df = read_cloud_object_with_retry(
            storage, db_data_cobject, deserialise)

        formula_msm = formula_map_df.merge(msms_df,
                                           how='inner',
                                           left_on='formula_i',
                                           right_index=True)
        modifiers = fdr.target_modifiers_df[[
            'neutral_loss', 'adduct'
        ]].rename(columns={'neutral_loss': 'modifier'})
        results_df = (
            fdr.estimate_fdr(formula_msm)
            .assign(database_path=db)
            .set_index('formula_i')
            .rename(columns={'modifier': 'combined_modifier', 'formula': 'mol'})
            .merge(modifiers, left_on='combined_modifier', right_index=True)
            .drop(columns=['combined_modifier'])
        )
        return results_df
Example #12
    def segment_spectra_chunk(chunk_cobject, id, storage):
        print(f'Segmenting spectra chunk {id}')
        sp_mz_int_buf = read_cloud_object_with_retry(storage, chunk_cobject,
                                                     msgpack.load)

        def _first_level_segment_upload(segm_i):
            l = ds_segments_bounds[segm_i][0, 0]
            r = ds_segments_bounds[segm_i][-1, 1]
            segm_start, segm_end = np.searchsorted(
                sp_mz_int_buf[:, 1], (l, r))  # mz expected to be in column 1
            segm = sp_mz_int_buf[segm_start:segm_end]
            return storage.put_cobject(msgpack.dumps(segm))

        with ThreadPoolExecutor(max_workers=128) as pool:
            sub_segms_cobjects = list(
                pool.map(_first_level_segment_upload,
                         range(len(ds_segments_bounds))))

        return sub_segms_cobjects
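The np.searchsorted call in _first_level_segment_upload works because each spectra chunk was sorted by m/z when it was built (Example #3), so every segment is one contiguous slice of the buffer. A toy version of the same lookup, with invented bounds:

    import numpy as np

    mzs = np.sort(np.random.uniform(100, 1000, size=1000))
    lo, hi = 250.0, 500.0  # one segment's m/z bounds
    start, end = np.searchsorted(mzs, (lo, hi))
    segment = mzs[start:end]
    assert np.all((segment >= lo) & (segment < hi))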
Example #13
    def process_centr_segment(db_segm_cobject, id, storage):
        print(f'Reading centroids segment {id}')
        # read database relevant part
        centr_df = read_cloud_object_with_retry(storage, db_segm_cobject,
                                                deserialise)

        # find range of datasets
        first_ds_segm_i, last_ds_segm_i = choose_ds_segments(
            ds_segments_bounds, centr_df, ppm)
        print(f'Reading dataset segments {first_ds_segm_i}-{last_ds_segm_i}')
        # read all segments in loop from COS
        sp_arr = read_ds_segments(
            ds_segms_cobjects[first_ds_segm_i:last_ds_segm_i + 1],
            ds_segms_len[first_ds_segm_i:last_ds_segm_i + 1], pw_mem_mb,
            ds_segm_size_mb, ds_segm_dtype, hybrid_impl, storage)

        formula_images_it = gen_iso_images(sp_inds=sp_arr.sp_i.values,
                                           sp_mzs=sp_arr.mz.values,
                                           sp_ints=sp_arr.int.values,
                                           centr_df=centr_df,
                                           nrows=nrows,
                                           ncols=ncols,
                                           ppm=ppm,
                                           min_px=1)
        if hybrid_impl:
            safe_mb = pw_mem_mb // 2
        else:
            safe_mb = 1024
        max_formula_images_mb = (
            pw_mem_mb - safe_mb -
            (last_ds_segm_i - first_ds_segm_i + 1) * ds_segm_size_mb) // 3
        print(f'Max formula_images size: {max_formula_images_mb} MB')
        images_manager = ImagesManager(storage,
                                       max_formula_images_mb * 1024**2)
        formula_image_metrics(formula_images_it, compute_metrics,
                              images_manager)
        images_cloud_objs = images_manager.finish()

        print(f'Centroids segment {id} finished')
        formula_metrics_df = pd.DataFrame.from_dict(
            images_manager.formula_metrics, orient='index')
        formula_metrics_df.index.name = 'formula_i'
        return formula_metrics_df, images_cloud_objs
Example #14
    def get_segm_bounds(storage):
        imzml_reader = pickle.loads(
            read_cloud_object_with_retry(storage, imzml_cobject))
        sp_n = len(imzml_reader.coordinates)
        sample_sp_inds = np.random.choice(np.arange(sp_n), min(sp_n, sample_n))
        print(f'Sampling {len(sample_sp_inds)} spectra')
        spectra_sample = list(
            get_spectra(ibd_url, imzml_reader, sample_sp_inds))

        spectra_mzs = np.concatenate(
            [mzs for sp_id, mzs, ints in spectra_sample])
        print(f'Got {len(spectra_mzs)} mzs')

        total_size = 3 * spectra_mzs.nbytes * sp_n / len(sample_sp_inds)

        segm_n = int(np.ceil(total_size / (ds_segm_size_mb * 2**20)))

        segm_bounds_q = [i / segm_n for i in range(segm_n + 1)]
        segm_lower_bounds = [
            np.quantile(spectra_mzs, q) for q in segm_bounds_q
        ]
        return np.array(
            list(zip(segm_lower_bounds[:-1], segm_lower_bounds[1:])))
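In other words, get_segm_bounds picks equally-spaced quantiles of the sampled m/z values as segment boundaries, so every segment ends up holding roughly the same number of peaks. A toy illustration:

    import numpy as np

    spectra_mzs = np.random.uniform(100, 1000, size=10000)
    segm_n = 4
    segm_bounds_q = [i / segm_n for i in range(segm_n + 1)]
    bounds = [np.quantile(spectra_mzs, q) for q in segm_bounds_q]
    segm_bounds = np.array(list(zip(bounds[:-1], bounds[1:])))
    print(segm_bounds)  # four (lower, upper) pairs, ~2500 sampled points each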
Example #15
 def _get(cobj):
     formula_chunk = read_cloud_object_with_retry(
         storage, cobj, deserialise)
     formula_to_id_chunk = dict(
         zip(formula_chunk.values, formula_chunk.index))
     return formula_to_id_chunk
Example #16
 def _get_mols(mols_cobj):
     return read_cloud_object_with_retry(storage, mols_cobj, deserialise)
Example #17
 def _merge(ch_i):
     segm_spectra_chunk = read_cloud_object_with_retry(
         storage, segm_cobjects[ch_i], deserialise)
     return segm_spectra_chunk
Example #18
 def _merge(cobject):
     segm_centr_df_chunk = read_cloud_object_with_retry(
         storage, cobject, pd.read_msgpack)
     return segm_centr_df_chunk
Example #19
 def _merge(ch_i):
     segm_spectra_chunk = read_cloud_object_with_retry(
         storage, segm_cobjects[ch_i], msgpack.load)
     return segm_spectra_chunk
Example #20
 def _merge(cobject):
     segm_centr_df_chunk = read_cloud_object_with_retry(
         storage, cobject, deserialise)
     return segm_centr_df_chunk