def define_centr_segments(pw, clip_centr_chunks_cobjects, centr_n, ds_segm_n,
                          ds_segm_size_mb):
    logger.info('Defining centroids segments bounds')

    def get_first_peak_mz(cobject, id, storage):
        print(
            f'Extracting first peak mz values from clipped centroids dataframe {id}'
        )
        centr_df = read_cloud_object_with_retry(storage, cobject,
                                                pd.read_msgpack)
        first_peak_df = centr_df[centr_df.peak_i == 0]
        return first_peak_df.mz.values

    memory_capacity_mb = 512
    futures = pw.map(get_first_peak_mz,
                     clip_centr_chunks_cobjects,
                     runtime_memory=memory_capacity_mb)
    first_peak_df_mz = np.concatenate(pw.get_result(futures))
    append_pywren_stats(futures, memory_mb=memory_capacity_mb)

    ds_size_mb = ds_segm_n * ds_segm_size_mb
    data_per_centr_segm_mb = 50
    peaks_per_centr_segm = 1e4
    centr_segm_n = int(
        max(ds_size_mb // data_per_centr_segm_mb,
            centr_n // peaks_per_centr_segm, 32))

    segm_bounds_q = [i * 1 / centr_segm_n for i in range(0, centr_segm_n)]
    centr_segm_lower_bounds = np.quantile(first_peak_df_mz, segm_bounds_q)

    logger.info(
        f'Generated {len(centr_segm_lower_bounds)} centroids bounds: {centr_segm_lower_bounds[0]}...{centr_segm_lower_bounds[-1]}'
    )
    return centr_segm_lower_bounds
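The returned bounds are plain quantiles of the first-peak m/z values, so later stages can place any centroid into a segment with a single searchsorted lookup. A minimal, self-contained sketch of that idea (synthetic m/z values and a hypothetical segment count):

import numpy as np

# Hypothetical sketch: quantile-based segment lower bounds over synthetic first-peak m/z values
first_peak_mz = np.random.uniform(100, 1000, size=100_000)
centr_segm_n = 32
segm_bounds_q = [i / centr_segm_n for i in range(centr_segm_n)]
centr_segm_lower_bounds = np.quantile(first_peak_mz, segm_bounds_q)

# Each m/z maps to the last segment whose lower bound does not exceed it
segm_i = np.searchsorted(centr_segm_lower_bounds, first_peak_mz, side='right') - 1
print(np.bincount(segm_i, minlength=centr_segm_n))  # roughly equal-sized segments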
def define_ds_segments(pw, ibd_url, imzml_cobject, ds_segm_size_mb, sample_n):
    def get_segm_bounds(storage):
        imzml_reader = pickle.loads(
            read_cloud_object_with_retry(storage, imzml_cobject))
        sp_n = len(imzml_reader.coordinates)
        sample_sp_inds = np.random.choice(np.arange(sp_n), min(sp_n, sample_n))
        print(f'Sampling {len(sample_sp_inds)} spectra')
        spectra_sample = list(
            get_spectra(ibd_url, imzml_reader, sample_sp_inds))

        spectra_mzs = np.concatenate(
            [mzs for sp_id, mzs, ints in spectra_sample])
        print(f'Got {len(spectra_mzs)} mzs')

        total_size = 3 * spectra_mzs.nbytes * sp_n / len(sample_sp_inds)

        segm_n = int(np.ceil(total_size / (ds_segm_size_mb * 2**20)))

        segm_bounds_q = [i * 1 / segm_n for i in range(0, segm_n + 1)]
        segm_lower_bounds = [
            np.quantile(spectra_mzs, q) for q in segm_bounds_q
        ]
        return np.array(
            list(zip(segm_lower_bounds[:-1], segm_lower_bounds[1:])))

    logger.info('Defining dataset segments bounds')
    memory_capacity_mb = 1024
    future = pw.call_async(get_segm_bounds, [],
                           runtime_memory=memory_capacity_mb)
    ds_segments = pw.get_result(future)
    append_pywren_stats(future, memory_mb=memory_capacity_mb)
    return ds_segments
def clip_centr_df(pw, bucket, centr_chunks_prefix, mz_min, mz_max):
    def clip_centr_df_chunk(obj, storage):
        print(f'Clipping centroids dataframe chunk {obj.key}')
        centroids_df_chunk = pd.read_msgpack(
            obj.data_stream._raw_stream).sort_values('mz')
        centroids_df_chunk = centroids_df_chunk[centroids_df_chunk.mz > 0]

        ds_mz_range_unique_formulas = centroids_df_chunk[
            (mz_min < centroids_df_chunk.mz)
            & (centroids_df_chunk.mz < mz_max)].index.unique()
        centr_df_chunk = centroids_df_chunk[centroids_df_chunk.index.isin(
            ds_mz_range_unique_formulas)].reset_index()
        clip_centr_chunk_cobject = storage.put_cobject(
            centr_df_chunk.to_msgpack())

        return clip_centr_chunk_cobject, centr_df_chunk.shape[0]

    memory_capacity_mb = 512
    futures = pw.map(clip_centr_df_chunk,
                     f'cos://{bucket}/{centr_chunks_prefix}/',
                     runtime_memory=memory_capacity_mb)
    clip_centr_chunks_cobjects, centr_n = list(zip(*pw.get_result(futures)))
    append_pywren_stats(futures,
                        memory_mb=memory_capacity_mb,
                        cloud_objects_n=len(futures))

    clip_centr_chunks_cobjects = list(clip_centr_chunks_cobjects)
    centr_n = sum(centr_n)
    logger.info(f'Prepared {centr_n} centroids')
    return clip_centr_chunks_cobjects, centr_n
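The clipping rule above is formula-level rather than peak-level: if any peak of a formula falls inside the dataset m/z range, all of that formula's peaks are kept. A toy in-memory sketch with made-up values:

import pandas as pd

# Hypothetical toy centroids table: index is formula_i, each formula has two peaks
centroids_df = pd.DataFrame(
    {'peak_i': [0, 1, 0, 1, 0, 1],
     'mz': [95.0, 110.0, 250.0, 260.0, 900.0, 910.0],
     'int': [100, 40, 100, 35, 100, 50]},
    index=pd.Index([1, 1, 2, 2, 3, 3], name='formula_i'))

mz_min, mz_max = 100.0, 500.0
# Formulas with at least one peak inside the dataset m/z range...
in_range_formulas = centroids_df[(mz_min < centroids_df.mz)
                                 & (centroids_df.mz < mz_max)].index.unique()
# ...are kept with all of their peaks, even those outside the range
clipped = centroids_df[centroids_df.index.isin(in_range_formulas)].reset_index()
print(clipped)  # formulas 1 and 2 survive (including the 95.0 peak); formula 3 is dropped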
Example #4
    def annotate(self):
        annotations_cache_key = f'{self.cacher.prefix}/annotate.cache'

        if self.cacher.exists(annotations_cache_key):
            self.formula_metrics_df, self.images_cloud_objs = self.cacher.load(
                annotations_cache_key)
            logger.info(
                f'Loaded {self.formula_metrics_df.shape[0]} metrics from cache'
            )
        else:
            logger.info('Annotating...')
            memory_capacity_mb = 4096 if self.is_intensive_dataset else 2048
            process_centr_segment = create_process_segment(
                self.ds_segms_cobjects, self.ds_segments_bounds,
                self.ds_segms_len, self.imzml_reader, self.image_gen_config,
                memory_capacity_mb, self.ds_segm_size_mb)

            futures = self.pywren_executor.map(
                process_centr_segment,
                self.db_segms_cobjects,
                runtime_memory=memory_capacity_mb)
            formula_metrics_list, images_cloud_objs = zip(
                *self.pywren_executor.get_result(futures))
            self.formula_metrics_df = pd.concat(formula_metrics_list)
            self.images_cloud_objs = list(chain(*images_cloud_objs))
            append_pywren_stats(futures,
                                memory_mb=memory_capacity_mb,
                                cloud_objects_n=len(self.images_cloud_objs))
            logger.info(
                f'Metrics calculated: {self.formula_metrics_df.shape[0]}')
            self.cacher.save((self.formula_metrics_df, self.images_cloud_objs),
                             annotations_cache_key)
Example #5
    def annotate(self):
        logger.info('Annotating...')
        clean_from_cos(self.config, self.config["storage"]["output_bucket"],
                       self.output["formula_images"])

        memory_capacity_mb = 2048  # TODO: Detect when this isn't enough and bump it up to 4096
        process_centr_segment = create_process_segment(
            self.config["storage"]["ds_bucket"],
            self.config["storage"]["output_bucket"],
            self.input_data["ds_segments"], self.ds_segments_bounds,
            self.ds_segms_len, self.coordinates, self.image_gen_config,
            memory_capacity_mb, self.ds_segm_size_mb,
            self.imzml_parser.mzPrecision)

        futures = self.pywren_executor.map(
            process_centr_segment,
            f'{self.config["storage"]["db_bucket"]}/{self.input_db["centroids_segments"]}/',
            runtime_memory=memory_capacity_mb)
        formula_metrics_list, images_cloud_objs = zip(
            *self.pywren_executor.get_result(futures))
        self.formula_metrics_df = pd.concat(formula_metrics_list)
        self.images_cloud_objs = list(chain(*images_cloud_objs))
        append_pywren_stats(futures,
                            memory=memory_capacity_mb,
                            plus_objects=len(self.images_cloud_objs))

        logger.info(f'Metrics calculated: {self.formula_metrics_df.shape[0]}')
Example #6
def calculate_fdrs(pw, rankings_df):
    def run_ranking(target_cobject, decoy_cobject, storage):
        target = pickle.loads(
            read_cloud_object_with_retry(storage, target_cobject))
        decoy = pickle.loads(
            read_cloud_object_with_retry(storage, decoy_cobject))
        merged = pd.concat(
            [target.assign(is_target=1),
             decoy.assign(is_target=0)],
            sort=False)
        merged = merged.sort_values('msm', ascending=False)
        decoy_cumsum = (merged.is_target == False).cumsum()
        target_cumsum = merged.is_target.cumsum()
        base_fdr = np.clip(decoy_cumsum / target_cumsum, 0, 1)
        base_fdr[np.isnan(base_fdr)] = 1
        target_fdrs = merged.assign(fdr=base_fdr)[lambda df: df.is_target == 1]
        target_fdrs = target_fdrs.drop('is_target', axis=1)
        target_fdrs = target_fdrs.sort_values('msm')
        target_fdrs = target_fdrs.assign(
            fdr=np.minimum.accumulate(target_fdrs.fdr))
        target_fdrs = target_fdrs.sort_index()
        return target_fdrs

    def merge_rankings(target_row, decoy_cobjects, storage):
        print("Merging rankings...")
        print(target_row)
        rankings = [
            run_ranking(target_row.cobject, decoy_cobject, storage)
            for decoy_cobject in decoy_cobjects
        ]
        mols = (pd.concat(rankings).rename_axis(
            'formula_i').reset_index().groupby('formula_i').agg({
                'fdr': np.nanmedian,
                'mol': 'first'
            }).assign(database_path=target_row.database_path,
                      adduct=target_row.adduct,
                      modifier=target_row.modifier))
        return mols

    ranking_jobs = []
    for group_i, group in rankings_df.groupby('group_i'):
        target_rows = group[group.is_target]
        decoy_rows = group[~group.is_target]

        for i, target_row in target_rows.iterrows():
            ranking_jobs.append([target_row, decoy_rows.cobject.tolist()])

    memory_capacity_mb = 256
    futures = pw.map(merge_rankings,
                     ranking_jobs,
                     runtime_memory=memory_capacity_mb)
    results = pw.get_result(futures)
    append_pywren_stats(futures, memory_mb=memory_capacity_mb)

    return pd.concat(results)
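run_ranking is the core of the FDR estimate: interleave target and decoy hits by MSM, take the running decoy/target ratio as a raw FDR, then make it monotone so a higher-MSM hit never gets a worse FDR. A self-contained toy version of those steps (made-up MSM values, no cloud objects):

import numpy as np
import pandas as pd

# Hypothetical target/decoy pair; the target index stands in for formula_i
target = pd.DataFrame({'mol': ['A', 'B', 'C', 'D'],
                       'msm': [0.95, 0.80, 0.60, 0.30]}, index=[101, 102, 103, 104])
decoy = pd.DataFrame({'msm': [0.85, 0.50, 0.20]})

merged = pd.concat([target.assign(is_target=1), decoy.assign(is_target=0)], sort=False)
merged = merged.sort_values('msm', ascending=False)
decoy_cumsum = (merged.is_target == 0).cumsum()
target_cumsum = merged.is_target.cumsum()
base_fdr = np.clip(decoy_cumsum / target_cumsum, 0, 1)
base_fdr[np.isnan(base_fdr)] = 1

# Keep only target rows and enforce monotonicity in MSM order
target_fdrs = merged.assign(fdr=base_fdr)[lambda df: df.is_target == 1].drop('is_target', axis=1)
target_fdrs = target_fdrs.sort_values('msm')
target_fdrs = target_fdrs.assign(fdr=np.minimum.accumulate(target_fdrs.fdr.values))
print(target_fdrs.sort_values('msm', ascending=False))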
def calculate_centroids(config, input_db, polarity='+', isocalc_sigma=0.001238):
    bucket = config["storage"]["db_bucket"]
    formulas_chunks_prefix = input_db["formulas_chunks"]
    centroids_chunks_prefix = input_db["centroids_chunks"]
    clean_from_cos(config, bucket, centroids_chunks_prefix)

    def calculate_peaks_for_formula(formula_i, formula):
        mzs, ints = isocalc_wrapper.centroids(formula)
        if mzs is not None:
            return list(zip(repeat(formula_i), range(len(mzs)), mzs, ints))
        else:
            return []

    def calculate_peaks_chunk(obj, id, storage):
        print(f'Calculating peaks from formulas chunk {obj.key}')
        chunk_df = pd.read_msgpack(obj.data_stream._raw_stream)
        peaks = [peak for formula_i, formula in chunk_df.formula.items()
                 for peak in calculate_peaks_for_formula(formula_i, formula)]
        peaks_df = pd.DataFrame(peaks, columns=['formula_i', 'peak_i', 'mz', 'int'])
        peaks_df.set_index('formula_i', inplace=True)

        print(f'Storing centroids chunk {id}')
        centroids_chunk_key = f'{centroids_chunks_prefix}/{id}.msgpack'
        storage.put_object(Bucket=bucket, Key=centroids_chunk_key, Body=peaks_df.to_msgpack())

        return peaks_df.shape[0]

    from annotation_pipeline.isocalc_wrapper import IsocalcWrapper # Import lazily so that the rest of the pipeline still works if the dependency is missing
    isocalc_wrapper = IsocalcWrapper({
        # These instrument settings are usually customized on a per-dataset basis out of a set of
        # 18 possible combinations, but most of EMBL's datasets are compatible with the following settings:
        'charge': {
            'polarity': polarity,
            'n_charges': 1,
        },
        'isocalc_sigma': float(f"{isocalc_sigma:f}") # Rounding to match production implementation
    })

    pw = pywren.ibm_cf_executor(config=config)
    memory_capacity_mb = 2048
    futures = pw.map(calculate_peaks_chunk, f'cos://{bucket}/{formulas_chunks_prefix}/', runtime_memory=memory_capacity_mb)
    centroids_chunks_n = pw.get_result(futures)
    append_pywren_stats(futures, memory_mb=memory_capacity_mb, cloud_objects_n=len(futures))

    num_centroids = sum(centroids_chunks_n)
    n_centroids_chunks = len(centroids_chunks_n)
    logger.info(f'Calculated {num_centroids} centroids in {n_centroids_chunks} chunks')
    return num_centroids, n_centroids_chunks
def get_imzml_reader(pw, imzml_path):
    def get_portable_imzml_reader(storage):
        imzml_stream = requests.get(imzml_path, stream=True).raw
        parser = ImzMLParser(imzml_stream, ibd_file=None)
        imzml_reader = parser.portable_spectrum_reader()
        imzml_cobject = storage.put_cobject(pickle.dumps(imzml_reader))
        return imzml_reader, imzml_cobject

    memory_capacity_mb = 1024
    future = pw.call_async(get_portable_imzml_reader, [],
                           runtime_memory=memory_capacity_mb)
    imzml_reader, imzml_cobject = pw.get_result(future)
    append_pywren_stats(future,
                        memory_mb=memory_capacity_mb,
                        cloud_objects_n=1)

    return imzml_reader, imzml_cobject
def clip_centr_df(pw, bucket, centr_chunks_prefix, clip_centr_chunk_prefix, mz_min, mz_max):
    def clip_centr_df_chunk(obj, id, ibm_cos):
        print(f'Clipping centroids dataframe chunk {obj.key}')
        centroids_df_chunk = pd.read_msgpack(obj.data_stream._raw_stream).sort_values('mz')
        centroids_df_chunk = centroids_df_chunk[centroids_df_chunk.mz > 0]

        ds_mz_range_unique_formulas = centroids_df_chunk[(mz_min < centroids_df_chunk.mz) &
                                                         (centroids_df_chunk.mz < mz_max)].index.unique()
        centr_df_chunk = centroids_df_chunk[centroids_df_chunk.index.isin(ds_mz_range_unique_formulas)].reset_index()
        ibm_cos.put_object(Bucket=bucket,
                           Key=f'{clip_centr_chunk_prefix}/{id}.msgpack',
                           Body=centr_df_chunk.to_msgpack())

        return centr_df_chunk.shape[0]

    memory_capacity_mb = 512
    futures = pw.map(clip_centr_df_chunk, f'{bucket}/{centr_chunks_prefix}/', runtime_memory=memory_capacity_mb)
    centr_n = sum(pw.get_result(futures))
    append_pywren_stats(futures, memory=memory_capacity_mb, plus_objects=len(futures))

    logger.info(f'Prepared {centr_n} centroids')
    return centr_n
def define_centr_segments(pw, bucket, clip_centr_chunk_prefix, centr_n, ds_segm_n, ds_segm_size_mb):
    logger.info('Defining centroids segments bounds')

    def get_first_peak_mz(obj):
        print(f'Extracting first peak mz values from clipped centroids dataframe {obj.key}')
        centr_df = pd.read_msgpack(obj.data_stream._raw_stream)
        first_peak_df = centr_df[centr_df.peak_i == 0]
        return first_peak_df.mz.values

    memory_capacity_mb = 512
    futures = pw.map(get_first_peak_mz, f'{bucket}/{clip_centr_chunk_prefix}/', runtime_memory=memory_capacity_mb)
    first_peak_df_mz = np.concatenate(pw.get_result(futures))
    append_pywren_stats(futures, memory=memory_capacity_mb)

    ds_size_mb = ds_segm_n * ds_segm_size_mb
    data_per_centr_segm_mb = 50
    peaks_per_centr_segm = 1e4
    centr_segm_n = int(max(ds_size_mb // data_per_centr_segm_mb, centr_n // peaks_per_centr_segm, 32))

    segm_bounds_q = [i * 1 / centr_segm_n for i in range(0, centr_segm_n)]
    centr_segm_lower_bounds = np.quantile(first_peak_df_mz, segm_bounds_q)

    logger.info(f'Generated {len(centr_segm_lower_bounds)} centroids bounds: {centr_segm_lower_bounds[0]}...{centr_segm_lower_bounds[-1]}')
    return centr_segm_lower_bounds
def chunk_spectra(pw, ibd_path, imzml_cobject, imzml_reader):
    MAX_CHUNK_SIZE = 512 * 1024**2  # 512MB

    sp_id_to_idx = get_pixel_indices(imzml_reader.coordinates)
    row_size = 3 * max(4,
                       np.dtype(imzml_reader.mzPrecision).itemsize,
                       np.dtype(imzml_reader.intensityPrecision).itemsize)

    def plan_chunks():
        chunk_sp_inds = []

        estimated_size = 0  # running chunk size in bytes, compared against MAX_CHUNK_SIZE
        # Iterate in the same order that intensities are laid out in the file, hopefully this will
        # prevent fragmented read patterns
        for sp_i in np.argsort(imzml_reader.intensityOffsets):
            spectrum_size = imzml_reader.mzLengths[sp_i] * row_size
            if estimated_size + spectrum_size > MAX_CHUNK_SIZE:
                estimated_size = 0
                yield np.array(chunk_sp_inds)
                chunk_sp_inds = []

            estimated_size += spectrum_size
            chunk_sp_inds.append(sp_i)

        if chunk_sp_inds:
            yield np.array(chunk_sp_inds)

    def upload_chunk(ch_i, storage):
        chunk_sp_inds = chunks[ch_i]
        # Get imzml_reader from COS because it's too big to include via pywren captured vars
        imzml_reader = pickle.loads(
            read_cloud_object_with_retry(storage, imzml_cobject))
        n_peaks = sum(imzml_reader.mzLengths[sp_i] for sp_i in chunk_sp_inds)
        sp_mz_int_buf = np.zeros((n_peaks, 3),
                                 dtype=imzml_reader.mzPrecision)

        chunk_start = 0
        for sp_i, mzs, ints in get_spectra(ibd_path, imzml_reader,
                                           chunk_sp_inds):
            chunk_end = chunk_start + len(mzs)
            sp_mz_int_buf[chunk_start:chunk_end, 0] = sp_id_to_idx[sp_i]
            sp_mz_int_buf[chunk_start:chunk_end, 1] = mzs
            sp_mz_int_buf[chunk_start:chunk_end, 2] = ints
            chunk_start = chunk_end

        by_mz = np.argsort(sp_mz_int_buf[:, 1])
        sp_mz_int_buf = sp_mz_int_buf[by_mz]
        del by_mz

        chunk = msgpack.dumps(sp_mz_int_buf)
        size_mb = sys.getsizeof(chunk) / 1024**2
        logger.info(f'Uploading spectra chunk {ch_i} - {size_mb:.2f} MB')
        chunk_cobject = storage.put_cobject(chunk)
        logger.info(f'Spectra chunk {ch_i} finished')
        return chunk_cobject

    chunks = list(plan_chunks())
    memory_capacity_mb = 3072
    futures = pw.map(upload_chunk,
                     range(len(chunks)),
                     runtime_memory=memory_capacity_mb)
    ds_chunks_cobjects = pw.get_result(futures)
    append_pywren_stats(futures,
                        memory_mb=memory_capacity_mb,
                        cloud_objects_n=len(chunks))

    return ds_chunks_cobjects
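plan_chunks is a greedy, size-bounded grouping of spectra. A standalone sketch of the same idea over synthetic spectrum lengths (no imzML file needed; the guard is slightly reordered so an oversized first spectrum cannot yield an empty chunk):

import numpy as np

MAX_CHUNK_SIZE = 512 * 1024**2          # 512 MB, as above
row_size = 3 * 8                        # assumed: 3 float64 values per peak
mz_lengths = np.random.randint(1_000, 50_000, size=2_000)  # synthetic peaks per spectrum

def plan_chunks(mz_lengths, row_size, max_chunk_size):
    """Greedily group spectrum indices so each chunk stays under max_chunk_size bytes."""
    chunk_sp_inds, chunk_size = [], 0
    for sp_i, n_peaks in enumerate(mz_lengths):
        spectrum_size = n_peaks * row_size
        if chunk_sp_inds and chunk_size + spectrum_size > max_chunk_size:
            yield np.array(chunk_sp_inds)
            chunk_sp_inds, chunk_size = [], 0
        chunk_sp_inds.append(sp_i)
        chunk_size += spectrum_size
    if chunk_sp_inds:
        yield np.array(chunk_sp_inds)

chunks = list(plan_chunks(mz_lengths, row_size, MAX_CHUNK_SIZE))
print(len(chunks), [len(c) for c in chunks])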
def segment_centroids(pw, clip_centr_chunks_cobjects, centr_segm_lower_bounds,
                      ds_segms_bounds, ds_segm_size_mb,
                      max_ds_segms_size_per_db_segm_mb, ppm):
    centr_segm_n = len(centr_segm_lower_bounds)
    centr_segm_lower_bounds = centr_segm_lower_bounds.copy()

    # define the first-level segmentation, then split each first-level segment into the desired number of sub-segments
    first_level_centr_segm_n = min(32, len(centr_segm_lower_bounds))
    centr_segm_lower_bounds = np.array_split(centr_segm_lower_bounds,
                                             first_level_centr_segm_n)
    first_level_centr_segm_bounds = np.array(
        [bounds[0] for bounds in centr_segm_lower_bounds])

    def segment_centr_df(centr_df, db_segm_lower_bounds):
        first_peak_df = centr_df[centr_df.peak_i == 0].copy()
        segment_mapping = np.searchsorted(
            db_segm_lower_bounds, first_peak_df.mz.values, side='right') - 1
        first_peak_df['segm_i'] = segment_mapping
        centr_segm_df = pd.merge(centr_df,
                                 first_peak_df[['formula_i', 'segm_i']],
                                 on='formula_i').sort_values('mz')
        return centr_segm_df

    def segment_centr_chunk(cobject, id, storage):
        print(f'Segmenting clipped centroids dataframe chunk {id}')
        centr_df = read_cloud_object_with_retry(storage, cobject,
                                                pd.read_msgpack)
        centr_segm_df = segment_centr_df(centr_df,
                                         first_level_centr_segm_bounds)

        def _first_level_upload(args):
            segm_i, df = args
            del df['segm_i']
            return segm_i, storage.put_cobject(df.to_msgpack())

        with ThreadPoolExecutor(max_workers=128) as pool:
            sub_segms = [(segm_i, df)
                         for segm_i, df in centr_segm_df.groupby('segm_i')]
            sub_segms_cobjects = list(pool.map(_first_level_upload, sub_segms))

        return dict(sub_segms_cobjects)

    memory_capacity_mb = 512
    first_futures = pw.map(segment_centr_chunk,
                           clip_centr_chunks_cobjects,
                           runtime_memory=memory_capacity_mb)
    first_level_segms_cobjects = pw.get_result(first_futures)
    append_pywren_stats(first_futures,
                        memory_mb=memory_capacity_mb,
                        cloud_objects_n=len(first_futures) *
                        len(centr_segm_lower_bounds))

    def merge_centr_df_segments(segm_cobjects, id, storage):
        print(f'Merging segment {id} clipped centroids chunks')

        def _merge(cobject):
            segm_centr_df_chunk = read_cloud_object_with_retry(
                storage, cobject, pd.read_msgpack)
            return segm_centr_df_chunk

        with ThreadPoolExecutor(max_workers=128) as pool:
            segm = pd.concat(list(pool.map(_merge, segm_cobjects)))

        def _second_level_segment(segm, sub_segms_n):
            segm_bounds_q = [
                i * 1 / sub_segms_n for i in range(0, sub_segms_n)
            ]
            sub_segms_lower_bounds = np.quantile(
                segm[segm.peak_i == 0].mz.values, segm_bounds_q)
            centr_segm_df = segment_centr_df(segm, sub_segms_lower_bounds)

            sub_segms = []
            for segm_i, df in centr_segm_df.groupby('segm_i'):
                del df['segm_i']
                sub_segms.append(df)
            return sub_segms

        init_segms = _second_level_segment(segm,
                                           len(centr_segm_lower_bounds[id]))

        from annotation_pipeline.image import choose_ds_segments
        segms = []
        for init_segm in init_segms:
            first_ds_segm_i, last_ds_segm_i = choose_ds_segments(
                ds_segms_bounds, init_segm, ppm)
            ds_segms_to_download_n = last_ds_segm_i - first_ds_segm_i + 1
            segms.append((ds_segms_to_download_n, init_segm))
        segms = sorted(segms, key=lambda x: x[0], reverse=True)
        max_ds_segms_to_download_n, max_segm = segms[0]

        max_iterations_n = 100
        iterations_n = 1
        while max_ds_segms_to_download_n * ds_segm_size_mb > max_ds_segms_size_per_db_segm_mb and iterations_n < max_iterations_n:

            sub_segms = []
            sub_segms_n = math.ceil(max_ds_segms_to_download_n *
                                    ds_segm_size_mb /
                                    max_ds_segms_size_per_db_segm_mb)
            for sub_segm in _second_level_segment(max_segm, sub_segms_n):
                first_ds_segm_i, last_ds_segm_i = choose_ds_segments(
                    ds_segms_bounds, sub_segm, ppm)
                ds_segms_to_download_n = last_ds_segm_i - first_ds_segm_i + 1
                sub_segms.append((ds_segms_to_download_n, sub_segm))

            segms = sub_segms + segms[1:]
            segms = sorted(segms, key=lambda x: x[0], reverse=True)
            iterations_n += 1
            max_ds_segms_to_download_n, max_segm = segms[0]

        def _second_level_upload(df):
            return storage.put_cobject(df.to_msgpack())

        print(f'Storing {len(segms)} centroids segments')
        with ThreadPoolExecutor(max_workers=128) as pool:
            segms = [df for _, df in segms]
            segms_cobjects = list(pool.map(_second_level_upload, segms))

        return segms_cobjects

    from collections import defaultdict
    second_level_segms_cobjects = defaultdict(list)
    for sub_segms_cobjects in first_level_segms_cobjects:
        for first_level_segm_i in sub_segms_cobjects:
            second_level_segms_cobjects[first_level_segm_i].append(
                sub_segms_cobjects[first_level_segm_i])
    second_level_segms_cobjects = sorted(second_level_segms_cobjects.items(),
                                         key=lambda x: x[0])
    second_level_segms_cobjects = [
        [cobjects] for segm_i, cobjects in second_level_segms_cobjects
    ]

    memory_capacity_mb = 2048
    second_futures = pw.map(merge_centr_df_segments,
                            second_level_segms_cobjects,
                            runtime_memory=memory_capacity_mb)
    db_segms_cobjects = list(np.concatenate(pw.get_result(second_futures)))
    append_pywren_stats(second_futures,
                        memory_mb=memory_capacity_mb,
                        cloud_objects_n=centr_segm_n)

    return db_segms_cobjects
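segment_centr_df assigns a segment from the first peak's m/z only and then propagates it to every peak of the same formula, so a formula is never split across centroid segments. A minimal sketch with toy data:

import numpy as np
import pandas as pd

# Hypothetical toy data: three formulas with two peaks each, and three segment lower bounds
centr_df = pd.DataFrame({
    'formula_i': [1, 1, 2, 2, 3, 3],
    'peak_i':    [0, 1, 0, 1, 0, 1],
    'mz':        [105.0, 120.0, 210.0, 230.0, 320.0, 340.0],
})
segm_lower_bounds = np.array([100.0, 200.0, 300.0])

# Segment is decided by the first (peak_i == 0) peak only...
first_peak_df = centr_df[centr_df.peak_i == 0].copy()
first_peak_df['segm_i'] = np.searchsorted(segm_lower_bounds,
                                          first_peak_df.mz.values, side='right') - 1
# ...then propagated to every peak of that formula
centr_segm_df = pd.merge(centr_df, first_peak_df[['formula_i', 'segm_i']],
                         on='formula_i').sort_values('mz')
print(centr_segm_df)  # formula 1 -> segment 0, formula 2 -> segment 1, formula 3 -> segment 2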
def segment_spectra(pw, bucket, ds_chunks_prefix, ds_segments_prefix, ds_segments_bounds, ds_segm_size_mb, segm_dtype):
    ds_segm_n = len(ds_segments_bounds)

    # extend boundaries of the first and last segments
    # to include all mzs outside of the spectra sample mz range
    ds_segments_bounds = ds_segments_bounds.copy()
    ds_segments_bounds[0, 0] = 0
    ds_segments_bounds[-1, 1] = MAX_MZ_VALUE

    # define the first-level segmentation, then split each first-level segment into the desired number of sub-segments
    first_level_segm_size_mb = 512
    first_level_segm_n = (len(ds_segments_bounds) * ds_segm_size_mb) // first_level_segm_size_mb
    first_level_segm_n = max(first_level_segm_n, 1)
    ds_segments_bounds = np.array_split(ds_segments_bounds, first_level_segm_n)

    def segment_spectra_chunk(obj, id, ibm_cos):
        print(f'Segmenting spectra chunk {obj.key}')
        sp_mz_int_buf = msgpack.loads(obj.data_stream.read())

        def _first_level_segment_upload(segm_i):
            l = ds_segments_bounds[segm_i][0, 0]
            r = ds_segments_bounds[segm_i][-1, 1]
            segm_start, segm_end = np.searchsorted(sp_mz_int_buf[:, 1], (l, r))  # mz expected to be in column 1
            segm = sp_mz_int_buf[segm_start:segm_end]
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{ds_segments_prefix}/chunk/{segm_i}/{id}.msgpack',
                               Body=msgpack.dumps(segm))

        with ThreadPoolExecutor(max_workers=128) as pool:
            pool.map(_first_level_segment_upload, range(len(ds_segments_bounds)))

    memory_safe_mb = 1024
    memory_capacity_mb = first_level_segm_size_mb * 2 + memory_safe_mb
    first_futures = pw.map(segment_spectra_chunk, f'{bucket}/{ds_chunks_prefix}/', runtime_memory=memory_capacity_mb)
    pw.get_result(first_futures)
    if not isinstance(first_futures, list): first_futures = [first_futures]
    append_pywren_stats(first_futures, memory=memory_capacity_mb, plus_objects=len(first_futures) * len(ds_segments_bounds))

    def merge_spectra_chunk_segments(segm_i, ibm_cos):
        print(f'Merging segment {segm_i} spectra chunks')

        keys = list_keys(bucket, f'{ds_segments_prefix}/chunk/{segm_i}/', ibm_cos)

        def _merge(key):
            segm_spectra_chunk = read_object_with_retry(ibm_cos, bucket, key, msgpack.load)
            return segm_spectra_chunk

        with ThreadPoolExecutor(max_workers=128) as pool:
            segm = list(pool.map(_merge, keys))

        segm = np.concatenate(segm)

        # Alternative in-place sorting (slower) :
        # segm.view(f'{segm_dtype},{segm_dtype},{segm_dtype}').sort(order=['f1'], axis=0)
        segm = segm[segm[:, 1].argsort()]

        clean_from_cos(None, bucket, f'{ds_segments_prefix}/chunk/{segm_i}/', ibm_cos)
        bounds_list = ds_segments_bounds[segm_i]

        segms_len = []
        for segm_j in range(len(bounds_list)):
            l, r = bounds_list[segm_j]
            segm_start, segm_end = np.searchsorted(segm[:, 1], (l, r))  # mz expected to be in column 1
            sub_segm = segm[segm_start:segm_end]
            segms_len.append(len(sub_segm))
            base_id = sum([len(bounds) for bounds in ds_segments_bounds[:segm_i]])
            id = base_id + segm_j
            print(f'Storing dataset segment {id}')
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{ds_segments_prefix}/{id}.msgpack',
                               Body=msgpack.dumps(sub_segm))

        return segms_len

    # same memory capacity
    second_futures = pw.map(merge_spectra_chunk_segments, range(len(ds_segments_bounds)), runtime_memory=memory_capacity_mb)
    ds_segms_len = list(np.concatenate(pw.get_result(second_futures)))
    append_pywren_stats(second_futures, memory=memory_capacity_mb, plus_objects=ds_segm_n, minus_objects=len(first_futures) * len(ds_segments_bounds))

    return ds_segm_n, ds_segms_len
def segment_centroids(pw, bucket, clip_centr_chunk_prefix, centr_segm_prefix, centr_segm_lower_bounds):
    centr_segm_n = len(centr_segm_lower_bounds)
    centr_segm_lower_bounds = centr_segm_lower_bounds.copy()

    # define the first-level segmentation, then split each first-level segment into the desired number of sub-segments
    first_level_centr_segm_n = min(32, len(centr_segm_lower_bounds))
    centr_segm_lower_bounds = np.array_split(centr_segm_lower_bounds, first_level_centr_segm_n)
    first_level_centr_segm_bounds = np.array([bounds[0] for bounds in centr_segm_lower_bounds])

    def segment_centr_df(centr_df, db_segm_lower_bounds):
        first_peak_df = centr_df[centr_df.peak_i == 0].copy()
        segment_mapping = np.searchsorted(db_segm_lower_bounds, first_peak_df.mz.values, side='right') - 1
        first_peak_df['segm_i'] = segment_mapping
        centr_segm_df = pd.merge(centr_df, first_peak_df[['formula_i', 'segm_i']], on='formula_i').sort_values('mz')
        return centr_segm_df

    def segment_centr_chunk(obj, id, ibm_cos):
        print(f'Segmenting clipped centroids dataframe chunk {obj.key}')
        centr_df = pd.read_msgpack(obj.data_stream._raw_stream)
        centr_segm_df = segment_centr_df(centr_df, first_level_centr_segm_bounds)

        def _first_level_upload(args):
            segm_i, df = args
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{centr_segm_prefix}/chunk/{segm_i}/{id}.msgpack',
                               Body=df.to_msgpack())

        with ThreadPoolExecutor(max_workers=128) as pool:
            pool.map(_first_level_upload, [(segm_i, df) for segm_i, df in centr_segm_df.groupby('segm_i')])

    memory_capacity_mb = 512
    first_futures = pw.map(segment_centr_chunk, f'{bucket}/{clip_centr_chunk_prefix}/', runtime_memory=memory_capacity_mb)
    pw.get_result(first_futures)
    append_pywren_stats(first_futures, memory=memory_capacity_mb,
                        plus_objects=len(first_futures) * len(centr_segm_lower_bounds))

    def merge_centr_df_segments(segm_i, ibm_cos):
        print(f'Merging segment {segm_i} clipped centroids chunks')

        keys = list_keys(bucket, f'{centr_segm_prefix}/chunk/{segm_i}/', ibm_cos)

        def _merge(key):
            segm_centr_df_chunk = read_object_with_retry(ibm_cos, bucket, key, pd.read_msgpack)
            return segm_centr_df_chunk

        with ThreadPoolExecutor(max_workers=128) as pool:
            segm = pd.concat(list(pool.map(_merge, keys)))
            del segm['segm_i']

        clean_from_cos(None, bucket, f'{centr_segm_prefix}/chunk/{segm_i}/', ibm_cos)
        centr_segm_df = segment_centr_df(segm, centr_segm_lower_bounds[segm_i])

        def _second_level_upload(args):
            segm_j, df = args
            base_id = sum([len(bounds) for bounds in centr_segm_lower_bounds[:segm_i]])
            id = base_id + segm_j
            print(f'Storing centroids segment {id}')
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{centr_segm_prefix}/{id}.msgpack',
                               Body=df.to_msgpack())

        with ThreadPoolExecutor(max_workers=128) as pool:
            pool.map(_second_level_upload, [(segm_i, df) for segm_i, df in centr_segm_df.groupby('segm_i')])

    memory_capacity_mb = 1024
    second_futures = pw.map(merge_centr_df_segments, range(len(centr_segm_lower_bounds)), runtime_memory=memory_capacity_mb)
    pw.get_result(second_futures)
    append_pywren_stats(second_futures, memory=memory_capacity_mb,
                        plus_objects=centr_segm_n, minus_objects=len(first_futures) * len(centr_segm_lower_bounds))

    return centr_segm_n
Example #15
def build_fdr_rankings(pw, bucket, input_data, input_db, formula_scores_df):
    def build_ranking(group_i, ranking_i, database, modifier, adduct, id,
                      storage):
        print("Building ranking...")
        print(f'job_i: {id}')
        print(f'ranking_i: {ranking_i}')
        print(f'database: {database}')
        print(f'modifier: {modifier}')
        print(f'adduct: {adduct}')
        # For every unmodified formula in `database`, look up the MSM score for the molecule
        # that it would become after the modifier and adduct are applied
        mols = pickle.loads(read_object_with_retry(storage, bucket, database))
        if adduct is not None:
            # Target rankings use the same adduct for all molecules
            mol_formulas = list(
                map(safe_generate_ion_formula, mols, repeat(modifier),
                    repeat(adduct)))
        else:
            # Decoy rankings use a consistent random adduct for each molecule, chosen so that it doesn't overlap
            # with other decoy rankings for this molecule
            adducts = _get_random_adduct_set(len(mols), decoy_adducts,
                                             ranking_i)
            mol_formulas = list(
                map(safe_generate_ion_formula, mols, repeat(modifier),
                    adducts))

        formula_to_id = {}
        keys = list_keys(bucket, f'{input_db["formula_to_id_chunks"]}/',
                         storage)
        for key in keys:
            formula_to_id_chunk = read_object_with_retry(
                storage, bucket, key, msgpack_load_text)

            for formula in mol_formulas:
                if formula_to_id_chunk.get(formula) is not None:
                    formula_to_id[formula] = formula_to_id_chunk.get(formula)

        formula_is = [
            formula and formula_to_id.get(formula) for formula in mol_formulas
        ]
        msm = [
            formula_i and msm_lookup.get(formula_i) for formula_i in formula_is
        ]
        if adduct is not None:
            ranking_df = pd.DataFrame({
                'mol': mols,
                'msm': msm
            },
                                      index=formula_is)
            ranking_df = ranking_df[~ranking_df.msm.isna()]
        else:
            # Specific molecules don't matter in the decoy rankings, only their msm distribution
            ranking_df = pd.DataFrame({'msm': msm})
            ranking_df = ranking_df[~ranking_df.msm.isna()]

        return id, storage.put_cobject(pickle.dumps(ranking_df))

    decoy_adducts = sorted(set(DECOY_ADDUCTS).difference(input_db['adducts']))
    n_decoy_rankings = input_data.get('num_decoys', len(decoy_adducts))
    msm_lookup = formula_scores_df.msm.to_dict()  # Ideally this data would stay in COS so it doesn't have to be reuploaded

    # Create a job for each list of molecules to be ranked
    ranking_jobs = []
    for group_i, (database, modifier) in enumerate(
            product(input_db['databases'], input_db['modifiers'])):
        # Target and decoy rankings are treated differently. Decoy rankings are identified by not having an adduct.
        ranking_jobs.extend(
            (group_i, ranking_i, database, modifier, adduct)
            for ranking_i, adduct in enumerate(input_db['adducts']))
        ranking_jobs.extend((group_i, ranking_i, database, modifier, None)
                            for ranking_i in range(n_decoy_rankings))

    memory_capacity_mb = 1536
    futures = pw.map(build_ranking,
                     ranking_jobs,
                     runtime_memory=memory_capacity_mb)
    ranking_cobjects = [
        cobject for job_i, cobject in sorted(pw.get_result(futures))
    ]
    append_pywren_stats(futures,
                        memory_mb=memory_capacity_mb,
                        cloud_objects_n=len(futures))

    rankings_df = pd.DataFrame(ranking_jobs,
                               columns=[
                                   'group_i', 'ranking_i', 'database_path',
                                   'modifier', 'adduct'
                               ])
    rankings_df = rankings_df.assign(is_target=~rankings_df.adduct.isnull(),
                                     cobject=ranking_cobjects)

    return rankings_df
Example #16
def build_database(config, input_db):
    bucket = config["storage"]["db_bucket"]
    formulas_chunks_prefix = input_db["formulas_chunks"]
    clean_from_cos(config, bucket, formulas_chunks_prefix)

    adducts = [*input_db['adducts'], *DECOY_ADDUCTS]
    modifiers = input_db['modifiers']
    databases = input_db['databases']

    N_HASH_SEGMENTS = 32  # should be less than N_FORMULAS_SEGMENTS

    def hash_formula_to_segment(formula):
        m = hashlib.md5()
        m.update(formula.encode('utf-8'))
        return int(m.hexdigest(), 16) % N_HASH_SEGMENTS

    def generate_formulas(adduct, ibm_cos):
        print(f'Generating formulas for adduct {adduct}')

        def _get_mols(mols_key):
            return pickle.loads(
                read_object_with_retry(ibm_cos, bucket, mols_key))

        with ThreadPoolExecutor(max_workers=128) as pool:
            mols_list = list(pool.map(_get_mols, databases))

        formulas = set()

        for mols in mols_list:
            for modifier in modifiers:
                formulas.update(
                    map(safe_generate_ion_formula, mols, repeat(modifier),
                        repeat(adduct)))

        if None in formulas:
            formulas.remove(None)

        formulas_segments = {}
        for formula in formulas:
            segm_i = hash_formula_to_segment(formula)
            if segm_i in formulas_segments:
                formulas_segments[segm_i].append(formula)
            else:
                formulas_segments[segm_i] = [formula]

        def _store(segm_i):
            ibm_cos.put_object(
                Bucket=bucket,
                Key=f'{formulas_chunks_prefix}/chunk/{segm_i}/{adduct}.pickle',
                Body=pickle.dumps(formulas_segments[segm_i]))

        segments_n = [segm_i for segm_i in formulas_segments]
        with ThreadPoolExecutor(max_workers=128) as pool:
            pool.map(_store, segments_n)

        return segments_n

    pw = pywren.ibm_cf_executor(config=config)
    memory_capacity_mb = 512
    futures = pw.map(generate_formulas,
                     adducts,
                     runtime_memory=memory_capacity_mb)
    segments_n = list(set().union(*pw.get_result(futures)))
    append_pywren_stats(futures,
                        memory=memory_capacity_mb,
                        plus_objects=len(adducts) * len(segments_n))

    def deduplicate_formulas_segment(segm_i, ibm_cos, clean=True):
        print(f'Deduplicating formulas segment {segm_i}')
        keys = list_keys(bucket, f'{formulas_chunks_prefix}/chunk/{segm_i}/',
                         ibm_cos)

        segm = set()
        for key in keys:
            segm_formulas_chunk = pickle.loads(
                read_object_with_retry(ibm_cos, bucket, key))
            segm.update(segm_formulas_chunk)

        if clean:
            clean_from_cos(config, bucket,
                           f'{formulas_chunks_prefix}/chunk/{segm_i}/',
                           ibm_cos)

        return segm

    def get_formulas_number_per_chunk(segm_i, ibm_cos):
        segm = deduplicate_formulas_segment(segm_i, ibm_cos, clean=False)
        return len(segm)

    pw = pywren.ibm_cf_executor(config=config)
    memory_capacity_mb = 512
    futures = pw.map(get_formulas_number_per_chunk,
                     segments_n,
                     runtime_memory=memory_capacity_mb)
    formulas_nums = pw.get_result(futures)
    append_pywren_stats(futures, memory=memory_capacity_mb)

    def store_formulas_segment(segm_i, ibm_cos):
        segm = deduplicate_formulas_segment(segm_i, ibm_cos)
        formula_i_start = sum(formulas_nums[:segm_i])
        formula_i_end = formula_i_start + len(segm)
        segm = pd.DataFrame(sorted(segm),
                            columns=['formula'],
                            index=pd.RangeIndex(formula_i_start,
                                                formula_i_end,
                                                name='formula_i'))

        ibm_cos.put_object(
            Bucket=bucket,
            Key=f'{formulas_chunks_prefix}_fdr/{segm_i}.msgpack',
            Body=segm.to_msgpack())

        n_threads = N_FORMULAS_SEGMENTS // N_HASH_SEGMENTS
        subsegm_size = math.ceil(len(segm) / n_threads)
        segm_list = [
            segm[i:i + subsegm_size]
            for i in range(0, segm.shape[0], subsegm_size)
        ]

        def _store(segm_j):
            id = segm_i * n_threads + segm_j
            print(f'Storing formulas segment {id}')
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{formulas_chunks_prefix}/{id}.msgpack',
                               Body=segm_list[segm_j].to_msgpack())

        with ThreadPoolExecutor(max_workers=128) as pool:
            pool.map(_store, range(n_threads))

        return [len(segm) for segm in segm_list]

    pw = pywren.ibm_cf_executor(config=config)
    memory_capacity_mb = 512
    futures = pw.map(store_formulas_segment,
                     segments_n,
                     runtime_memory=memory_capacity_mb)
    results = pw.get_result(futures)
    append_pywren_stats(futures,
                        memory=memory_capacity_mb,
                        plus_objects=N_FORMULAS_SEGMENTS,
                        minus_objects=len(adducts) * len(segments_n))

    num_formulas = sum(formulas_nums)
    n_formulas_chunks = sum([len(result) for result in results])
    logger.info(
        f'Generated {num_formulas} formulas in {n_formulas_chunks} chunks')

    formula_to_id_chunks_prefix = input_db["formula_to_id_chunks"]
    clean_from_cos(config, bucket, formula_to_id_chunks_prefix)
    formulas_bytes = 200 * num_formulas
    formula_to_id_chunk_mb = 512
    N_FORMULA_TO_ID = int(
        math.ceil(formulas_bytes / (formula_to_id_chunk_mb * 1024**2)))

    def store_formula_to_id_chunk(ch_i, ibm_cos):
        print(f'Storing formula_to_id dictionary chunk {ch_i}')
        start_id = (N_FORMULAS_SEGMENTS // N_FORMULA_TO_ID) * ch_i
        end_id = (N_FORMULAS_SEGMENTS // N_FORMULA_TO_ID) * (ch_i + 1)
        keys = [
            f'{formulas_chunks_prefix}/{formulas_chunk}.msgpack'
            for formulas_chunk in range(start_id, end_id)
        ]

        def _get(key):
            formula_chunk = read_object_with_retry(ibm_cos, bucket, key,
                                                   pd.read_msgpack)
            formula_to_id_chunk = dict(
                zip(formula_chunk.formula, formula_chunk.index))
            return formula_to_id_chunk

        with ThreadPoolExecutor(max_workers=128) as pool:
            results = list(pool.map(_get, keys))

        formula_to_id = {}
        for chunk_dict in results:
            formula_to_id.update(chunk_dict)

        ibm_cos.put_object(Bucket=bucket,
                           Key=f'{formula_to_id_chunks_prefix}/{ch_i}.msgpack',
                           Body=msgpack.dumps(formula_to_id))

    pw = pywren.ibm_cf_executor(config=config)
    safe_mb = 512
    memory_capacity_mb = formula_to_id_chunk_mb * 2 + safe_mb
    futures = pw.map(store_formula_to_id_chunk,
                     range(N_FORMULA_TO_ID),
                     runtime_memory=memory_capacity_mb)
    pw.get_result(futures)
    append_pywren_stats(futures,
                        memory=memory_capacity_mb,
                        plus_objects=N_FORMULA_TO_ID)
    logger.info(f'Built {N_FORMULA_TO_ID} formula_to_id dictionaries chunks')

    return num_formulas, n_formulas_chunks
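Deduplication across adducts works because hash_formula_to_segment is deterministic: the same formula, produced by different adduct jobs, always lands in the same hash segment, so each segment can be deduplicated independently of the others. A tiny sketch (the formulas are made up):

import hashlib

N_HASH_SEGMENTS = 32

def hash_formula_to_segment(formula):
    # Stable bucketing: identical formulas always map to the same segment, on any worker
    m = hashlib.md5()
    m.update(formula.encode('utf-8'))
    return int(m.hexdigest(), 16) % N_HASH_SEGMENTS

# Duplicates generated by different jobs collide into one segment and can be removed there
print(hash_formula_to_segment('C6H12O6+H'), hash_formula_to_segment('C6H12O6+H'))
print(hash_formula_to_segment('C6H12O6+Na'))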
def segment_spectra(pw, ds_chunks_cobjects, ds_segments_bounds,
                    ds_segm_size_mb, ds_segm_dtype):
    ds_segm_n = len(ds_segments_bounds)

    # extend boundaries of the first and last segments
    # to include all mzs outside of the spectra sample mz range
    ds_segments_bounds = ds_segments_bounds.copy()
    ds_segments_bounds[0, 0] = 0
    ds_segments_bounds[-1, 1] = MAX_MZ_VALUE

    # define the first-level segmentation, then split each first-level segment into the desired number of sub-segments
    first_level_segm_size_mb = 512
    first_level_segm_n = (len(ds_segments_bounds) *
                          ds_segm_size_mb) // first_level_segm_size_mb
    first_level_segm_n = max(first_level_segm_n, 1)
    ds_segments_bounds = np.array_split(ds_segments_bounds, first_level_segm_n)

    def segment_spectra_chunk(chunk_cobject, id, storage):
        print(f'Segmenting spectra chunk {id}')
        sp_mz_int_buf = read_cloud_object_with_retry(storage, chunk_cobject,
                                                     msgpack.load)

        def _first_level_segment_upload(segm_i):
            l = ds_segments_bounds[segm_i][0, 0]
            r = ds_segments_bounds[segm_i][-1, 1]
            segm_start, segm_end = np.searchsorted(
                sp_mz_int_buf[:, 1], (l, r))  # mz expected to be in column 1
            segm = sp_mz_int_buf[segm_start:segm_end]
            return storage.put_cobject(msgpack.dumps(segm))

        with ThreadPoolExecutor(max_workers=128) as pool:
            sub_segms_cobjects = list(
                pool.map(_first_level_segment_upload,
                         range(len(ds_segments_bounds))))

        return sub_segms_cobjects

    memory_safe_mb = 1536
    memory_capacity_mb = first_level_segm_size_mb * 2 + memory_safe_mb
    first_futures = pw.map(segment_spectra_chunk,
                           ds_chunks_cobjects,
                           runtime_memory=memory_capacity_mb)
    first_level_segms_cobjects = pw.get_result(first_futures)
    if not isinstance(first_futures, list): first_futures = [first_futures]
    append_pywren_stats(first_futures,
                        memory_mb=memory_capacity_mb,
                        cloud_objects_n=len(first_futures) *
                        len(ds_segments_bounds))

    def merge_spectra_chunk_segments(segm_cobjects, id, storage):
        print(f'Merging segment {id} spectra chunks')

        def _merge(ch_i):
            segm_spectra_chunk = read_cloud_object_with_retry(
                storage, segm_cobjects[ch_i], msgpack.load)
            return segm_spectra_chunk

        with ThreadPoolExecutor(max_workers=128) as pool:
            segm = list(pool.map(_merge, range(len(segm_cobjects))))

        segm = np.concatenate(segm)

        # Alternative in-place sorting (slower) :
        # segm.view(f'{ds_segm_dtype},{ds_segm_dtype},{ds_segm_dtype}').sort(order=['f1'], axis=0)
        segm = segm[segm[:, 1].argsort()]

        bounds_list = ds_segments_bounds[id]

        segms_len = []
        segms_cobjects = []
        for segm_j in range(len(bounds_list)):
            l, r = bounds_list[segm_j]
            segm_start, segm_end = np.searchsorted(
                segm[:, 1], (l, r))  # mz expected to be in column 1
            sub_segm = segm[segm_start:segm_end]
            segms_len.append(len(sub_segm))
            base_id = sum([len(bounds) for bounds in ds_segments_bounds[:id]])
            segm_i = base_id + segm_j
            print(f'Storing dataset segment {segm_i}')
            segms_cobjects.append(storage.put_cobject(msgpack.dumps(sub_segm)))

        return segms_len, segms_cobjects

    second_level_segms_cobjects = np.transpose(
        first_level_segms_cobjects).tolist()
    second_level_segms_cobjects = [
        [segm_cobjects] for segm_cobjects in second_level_segms_cobjects
    ]

    # same memory capacity
    second_futures = pw.map(merge_spectra_chunk_segments,
                            second_level_segms_cobjects,
                            runtime_memory=memory_capacity_mb)
    ds_segms_len, ds_segms_cobjects = list(zip(*pw.get_result(second_futures)))
    ds_segms_len = list(np.concatenate(ds_segms_len))
    ds_segms_cobjects = list(np.concatenate(ds_segms_cobjects))
    append_pywren_stats(second_futures,
                        memory_mb=memory_capacity_mb,
                        cloud_objects_n=ds_segm_n)

    return ds_segms_cobjects, ds_segms_len
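Both segmentation passes rely on the buffer being sorted by m/z (column 1), so every (lower, upper) bounds pair maps to one contiguous slice located with np.searchsorted. A toy illustration with hypothetical rows:

import numpy as np

# Rows of (pixel_idx, mz, intensity), already sorted by m/z
sp_mz_int_buf = np.array([[0, 101.0, 5.0],
                          [1, 149.0, 7.0],
                          [0, 152.0, 2.0],
                          [2, 198.0, 9.0],
                          [1, 251.0, 4.0]])
segment_bounds = np.array([[100.0, 150.0],
                           [150.0, 200.0],
                           [200.0, 300.0]])

for segm_i, (l, r) in enumerate(segment_bounds):
    segm_start, segm_end = np.searchsorted(sp_mz_int_buf[:, 1], (l, r))
    print(f'segment {segm_i}:', sp_mz_int_buf[segm_start:segm_end, 1])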