def upload_chunk(ch_i, storage):
    # `chunks`, `ibd_path`, `sp_id_to_idx` and `imzml_cobject` come from the
    # enclosing scope (this is a nested helper in the original pipeline).
    chunk_sp_inds = chunks[ch_i]
    # Get imzml_reader from COS because it's too big to include via pywren captured vars
    imzml_reader = pickle.loads(
        read_cloud_object_with_retry(storage, imzml_cobject))
    # Total number of m/z values across this chunk's spectra
    n_mz = sum(imzml_reader.mzLengths[sp_i] for sp_i in chunk_sp_inds)
    sp_mz_int_buf = np.zeros((n_mz, 3), dtype=imzml_reader.mzPrecision)

    chunk_start = 0
    for sp_i, mzs, ints in get_spectra(ibd_path, imzml_reader,
                                       chunk_sp_inds):
        chunk_end = chunk_start + len(mzs)
        sp_mz_int_buf[chunk_start:chunk_end, 0] = sp_id_to_idx[sp_i]
        sp_mz_int_buf[chunk_start:chunk_end, 1] = mzs
        sp_mz_int_buf[chunk_start:chunk_end, 2] = ints
        chunk_start = chunk_end

    # Sort all rows by m/z so downstream segmentation can slice contiguous ranges
    by_mz = np.argsort(sp_mz_int_buf[:, 1])
    sp_mz_int_buf = sp_mz_int_buf[by_mz]
    del by_mz

    chunk = msgpack.dumps(sp_mz_int_buf)
    size_mb = sys.getsizeof(chunk) / 1024 ** 2
    logger.info(f'Uploading spectra chunk {ch_i} - {size_mb:.2f} MB')
    chunk_cobject = storage.put_cobject(chunk)
    logger.info(f'Spectra chunk {ch_i} finished')
    return chunk_cobject
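The msgpack serialisation above only handles a NumPy array if msgpack has been extended for ndarrays; the pipeline presumably relies on msgpack_numpy for that (an assumption, not visible in this snippet). A minimal, self-contained sketch of the round trip, including the sort-by-m/z step:

import msgpack
import msgpack_numpy
import numpy as np

msgpack_numpy.patch()  # teach msgpack to (de)serialise ndarrays

buf = np.array([[0, 200.2, 3.0],
                [0, 100.1, 7.0],
                [1, 150.5, 9.0]], dtype='f')  # columns: spectrum idx, m/z, intensity
buf = buf[np.argsort(buf[:, 1])]              # sort rows by m/z, as upload_chunk does

packed = msgpack.dumps(buf)
restored = msgpack.loads(packed)
assert np.array_equal(buf, restored)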
def define_ds_segments(pw, ibd_url, imzml_cobject, ds_segm_size_mb, sample_n):
    def get_segm_bounds(storage):
        imzml_reader = pickle.loads(
            read_cloud_object_with_retry(storage, imzml_cobject))
        sp_n = len(imzml_reader.coordinates)
        sample_sp_inds = np.random.choice(np.arange(sp_n), min(sp_n, sample_n))
        print(f'Sampling {len(sample_sp_inds)} spectra')
        spectra_sample = list(
            get_spectra(ibd_url, imzml_reader, sample_sp_inds))

        spectra_mzs = np.concatenate(
            [mzs for sp_id, mzs, ints in spectra_sample])
        print(f'Got {len(spectra_mzs)} mzs')

        total_size = 3 * spectra_mzs.nbytes * sp_n / len(sample_sp_inds)

        segm_n = int(np.ceil(total_size / (ds_segm_size_mb * 2**20)))

        segm_bounds_q = [i / segm_n for i in range(segm_n + 1)]
        segm_lower_bounds = [
            np.quantile(spectra_mzs, q) for q in segm_bounds_q
        ]
        return np.array(
            list(zip(segm_lower_bounds[:-1], segm_lower_bounds[1:])))

    logger.info('Defining dataset segment bounds')
    memory_capacity_mb = 1024
    future = pw.call_async(get_segm_bounds, [],
                           runtime_memory=memory_capacity_mb)
    ds_segments = pw.get_result(future)
    append_pywren_stats(future, memory_mb=memory_capacity_mb)
    return ds_segments
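The segment bounds returned above are simply quantile cut-points over the sampled m/z values, so each segment covers roughly the same number of data points. A standalone sketch of the same idea with synthetic numbers:

import numpy as np

spectra_mzs = np.random.uniform(100, 1000, size=100_000)  # stand-in for sampled m/z values
segm_n = 4
qs = [i / segm_n for i in range(segm_n + 1)]              # 0, 0.25, 0.5, 0.75, 1.0
edges = [np.quantile(spectra_mzs, q) for q in qs]
bounds = np.array(list(zip(edges[:-1], edges[1:])))       # shape (segm_n, 2)
print(bounds)  # each (lower, upper) row holds about a quarter of the values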
Example #3
def log_bad_results(merged_results, missing_results, spatial_wrong, spectral_wrong, chaos_wrong, msm_wrong, fdr_error):
    fdr_any_error = fdr_error[lambda df: df.fdr_error > 0]
    fdr_big_error = fdr_error[lambda df: df.fdr_error > 1]
    results = [
        # Name, Maximum allowed, Actual value, Extra data
        ('Missing annotations', 0, len(missing_results), missing_results.head()),
        # A small number of results are off by up to 1% due to an algorithm change since they were processed
        # Annotations with fewer than 4 ion images now have slightly higher spatial and spectral score than before
        ('Incorrect spatial metric', 2, len(spatial_wrong), spatial_wrong.head()),
        ('Incorrect spectral metric', 5, len(spectral_wrong), spectral_wrong.head()),
        ('Incorrect chaos metric', 0, len(chaos_wrong), chaos_wrong.head()),
        ('Incorrect MSM', 2, len(msm_wrong), msm_wrong.head()),
        # FDR can vary significantly depending on which decoy adducts were chosen.
        ('FDR changed', len(merged_results) * 0.25, len(fdr_any_error), fdr_any_error.head()),
        ('FDR changed significantly', len(merged_results) * 0.1, len(fdr_big_error), fdr_big_error.head()),
    ]
    failed_results = []
    for result_name, threshold, value, data in results:
        if value <= threshold:
            logger.info(f'{result_name}: {value} (PASS)')
        else:
            logger.error(f'{result_name}: {value} (FAIL)')
            failed_results.append((result_name, data))

    for result_name, data in failed_results:
        logger.error(f'{result_name} extra info:\n{str(data)}\n')

    if not failed_results:
        logger.info('All checks pass')
    else:
        logger.error(f'{len(failed_results)} checks failed')
Example #4
    def segment_ds(self):
        ds_segments_cache_key = f'{self.cacher.prefix}/segment_ds.cache'

        if self.cacher.exists(ds_segments_cache_key):
            self.ds_segments_bounds, self.ds_segms_cobjects, self.ds_segms_len = \
                self.cacher.load(ds_segments_cache_key)
            logger.info(
                f'Loaded {len(self.ds_segms_cobjects)} dataset segments from cache'
            )
        else:
            sample_sp_n = 1000
            self.ds_segments_bounds = define_ds_segments(
                self.pywren_executor, self.ds_config["ibd_path"],
                self.imzml_cobject, self.ds_segm_size_mb, sample_sp_n)
            self.ds_segms_cobjects, self.ds_segms_len = \
                segment_spectra(self.pywren_executor, self.ds_chunks_cobjects, self.ds_segments_bounds,
                                self.ds_segm_size_mb, self.imzml_reader.mzPrecision)
            logger.info(
                f'Segmented dataset chunks into {len(self.ds_segms_cobjects)} segments'
            )
            self.cacher.save((self.ds_segments_bounds, self.ds_segms_cobjects,
                              self.ds_segms_len), ds_segments_cache_key)

        self.ds_segm_n = len(self.ds_segms_cobjects)
        self.is_intensive_dataset = self.ds_segm_n * self.ds_segm_size_mb > 5000
Example #5
    def segment_centroids(self):
        mz_min, mz_max = self.ds_segments_bounds[
            0, 0], self.ds_segments_bounds[-1, 1]
        db_segments_cache_key = f'{self.cacher.prefix}/segment_centroids.cache'

        if self.cacher.exists(db_segments_cache_key):
            self.clip_centr_chunks_cobjects, self.db_segms_cobjects = self.cacher.load(
                db_segments_cache_key)
            logger.info(
                f'Loaded {len(self.db_segms_cobjects)} centroids segments from cache'
            )
        else:
            self.clip_centr_chunks_cobjects, centr_n = \
                clip_centr_df(self.pywren_executor, self.config["storage"]["db_bucket"],
                              self.db_config["centroids_chunks"], mz_min, mz_max)
            centr_segm_lower_bounds = define_centr_segments(
                self.pywren_executor, self.clip_centr_chunks_cobjects, centr_n,
                self.ds_segm_n, self.ds_segm_size_mb)

            max_ds_segms_size_per_db_segm_mb = 2560 if self.is_intensive_dataset else 1536
            self.db_segms_cobjects = segment_centroids(
                self.pywren_executor, self.clip_centr_chunks_cobjects,
                centr_segm_lower_bounds, self.ds_segments_bounds,
                self.ds_segm_size_mb, max_ds_segms_size_per_db_segm_mb,
                self.image_gen_config['ppm'])
            logger.info(
                f'Segmented centroids chunks into {len(self.db_segms_cobjects)} segments'
            )

            self.cacher.save(
                (self.clip_centr_chunks_cobjects, self.db_segms_cobjects),
                db_segments_cache_key)

        self.centr_segm_n = len(self.db_segms_cobjects)
Example #6
def calculate_fdrs_vm(storage, formula_scores_df, db_data_cobjects):
    t = time()

    msms_df = formula_scores_df[['msm']]

    def run_fdr(db_data_cobject):
        db, fdr, formula_map_df = read_cloud_object_with_retry(
            storage, db_data_cobject, deserialise)

        formula_msm = formula_map_df.merge(msms_df,
                                           how='inner',
                                           left_on='formula_i',
                                           right_index=True)
        modifiers = fdr.target_modifiers_df[[
            'neutral_loss', 'adduct'
        ]].rename(columns={'neutral_loss': 'modifier'})
        results_df = (fdr.estimate_fdr(formula_msm).assign(
            database_path=db).set_index('formula_i').rename(columns={
                'modifier': 'combined_modifier',
                'formula': 'mol'
            }).merge(modifiers, left_on='combined_modifier',
                     right_index=True).drop(columns=['combined_modifier']))
        return results_df

    logger.info('Estimating FDRs...')
    with ThreadPoolExecutor(os.cpu_count()) as pool:
        results_dfs = list(pool.map(run_fdr, db_data_cobjects))

    exec_time = time() - t
    return pd.concat(results_dfs), exec_time
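run_fdr attaches per-formula MSM scores to the formula map by merging a column against the score table's index. A toy pandas illustration of that merge pattern (values are made up):

import pandas as pd

formula_map_df = pd.DataFrame({'formula_i': [0, 1, 2],
                               'mol': ['H2O', 'CO2', 'C6H12O6']})
msms_df = pd.DataFrame({'msm': [0.9, 0.4, 0.75]}, index=[0, 1, 2])  # index = formula_i

formula_msm = formula_map_df.merge(msms_df, how='inner',
                                   left_on='formula_i', right_index=True)
print(formula_msm)  # formula_i, mol and msm side by side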
Example #7
    def segment_centroids(self, use_cache=True, debug_validate=False):
        mz_min, mz_max = self.ds_segments_bounds[0, 0], self.ds_segments_bounds[-1, 1]
        cache_key = ':ds/:db/segment_centroids.cache'

        if use_cache and self.cacher.exists(cache_key):
            self.clip_centr_chunks_cobjects, self.db_segms_cobjects = self.cacher.load(cache_key)
            logger.info(f'Loaded {len(self.db_segms_cobjects)} centroids segments from cache')
        else:
            self.clip_centr_chunks_cobjects, centr_n = \
                clip_centr_df(self.lithops_executor, self.peaks_cobjects, mz_min, mz_max)
            centr_segm_lower_bounds = define_centr_segments(self.lithops_executor, self.clip_centr_chunks_cobjects,
                                                            centr_n, self.ds_segm_n, self.ds_segm_size_mb)

            max_ds_segms_size_per_db_segm_mb = 2560 if self.is_intensive_dataset else 1536
            self.db_segms_cobjects = segment_centroids(self.lithops_executor, self.clip_centr_chunks_cobjects,
                                                       centr_segm_lower_bounds, self.ds_segments_bounds,
                                                       self.ds_segm_size_mb, max_ds_segms_size_per_db_segm_mb,
                                                       self.image_gen_config['ppm'])
            logger.info(f'Segmented centroids chunks into {len(self.db_segms_cobjects)} segments')

            self.cacher.save((self.clip_centr_chunks_cobjects, self.db_segms_cobjects), cache_key)

        self.centr_segm_n = len(self.db_segms_cobjects)

        if debug_validate:
            validate_centroid_segments(
                self.lithops_executor, self.db_segms_cobjects, self.ds_segments_bounds,
                self.image_gen_config['ppm']
            )
def download_dataset(imzml_cobject, ibd_cobject, local_path, storage):
    def _download(url_or_cobject, path):
        if isinstance(url_or_cobject, CloudObject):
            stream = storage.get_cloudobject(url_or_cobject, stream=True)
            with path.open('wb') as f:
                copyfileobj(stream, f, 1024 * 1024)
        else:
            with requests.get(url_or_cobject, stream=True) as r:
                r.raise_for_status()
                with path.open('wb') as f:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)

    local_path = Path(local_path)
    local_path.mkdir(exist_ok=True)
    imzml_path = local_path / 'ds.imzML'
    ibd_path = local_path / 'ds.ibd'

    logger.info("Download dataset {} - {} ".format(imzml_cobject, imzml_path))
    _download(imzml_cobject, imzml_path)
    logger.info("Download dataset {} - {} ".format(ibd_cobject, ibd_path))
    _download(ibd_cobject, ibd_path)

    imzml_size = imzml_path.stat().st_size / (1024**2)
    ibd_size = ibd_path.stat().st_size / (1024**2)
    logger.debug(f'imzML size: {imzml_size:.2f} MB')
    logger.debug(f'ibd size: {ibd_size:.2f} MB')
    return imzml_path, ibd_path
Example #9
def clip_centr_df(pw, peaks_cobjects, mz_min, mz_max):
    def clip_centr_df_chunk(peaks_i, peaks_cobject, storage):
        print(f'Clipping centroids dataframe chunk {peaks_i}')
        centroids_df_chunk = deserialise(
            storage.get_cloudobject(peaks_cobject,
                                    stream=True)).sort_values('mz')
        centroids_df_chunk = centroids_df_chunk[centroids_df_chunk.mz > 0]

        ds_mz_range_unique_formulas = centroids_df_chunk[
            (mz_min < centroids_df_chunk.mz)
            & (centroids_df_chunk.mz < mz_max)].index.unique()
        centr_df_chunk = centroids_df_chunk[centroids_df_chunk.index.isin(
            ds_mz_range_unique_formulas)].reset_index()
        clip_centr_chunk_cobject = storage.put_cloudobject(
            serialise(centr_df_chunk))

        return clip_centr_chunk_cobject, centr_df_chunk.shape[0]

    memory_capacity_mb = 512
    futures = pw.map(clip_centr_df_chunk,
                     list(enumerate(peaks_cobjects)),
                     runtime_memory=memory_capacity_mb)
    clip_centr_chunks_cobjects, centr_n = list(zip(*pw.get_result(futures)))
    PipelineStats.append_func(futures,
                              memory_mb=memory_capacity_mb,
                              cloud_objects_n=len(futures))

    clip_centr_chunks_cobjects = list(clip_centr_chunks_cobjects)
    centr_n = sum(centr_n)
    logger.info(f'Prepared {centr_n} centroids')
    return clip_centr_chunks_cobjects, centr_n
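clip_centr_df_chunk keeps every peak of any formula that has at least one peak inside the dataset's m/z range, rather than dropping individual out-of-range peaks. A small self-contained sketch of that indexing pattern (values are made up):

import pandas as pd

centroids = pd.DataFrame(
    {'mz': [98.5, 150.2, 1310.7, 1205.0], 'peak_i': [0, 0, 1, 0]},
    index=pd.Index([7, 8, 8, 9], name='formula_i'))
mz_min, mz_max = 100.0, 1000.0

in_range_formulas = centroids[(mz_min < centroids.mz)
                              & (centroids.mz < mz_max)].index.unique()
clipped = centroids[centroids.index.isin(in_range_formulas)].reset_index()
print(clipped)  # formula 8 keeps both peaks (even the out-of-range one); 7 and 9 are dropped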
Example #10
    def annotate(self):
        logger.info('Annotating...')
        clean_from_cos(self.config, self.config["storage"]["output_bucket"],
                       self.output["formula_images"])

        memory_capacity_mb = 2048  # TODO: Detect when this isn't enough and bump it up to 4096
        process_centr_segment = create_process_segment(
            self.config["storage"]["ds_bucket"],
            self.config["storage"]["output_bucket"],
            self.input_data["ds_segments"], self.ds_segments_bounds,
            self.ds_segms_len, self.coordinates, self.image_gen_config,
            memory_capacity_mb, self.ds_segm_size_mb,
            self.imzml_parser.mzPrecision)

        futures = self.pywren_executor.map(
            process_centr_segment,
            f'{self.config["storage"]["db_bucket"]}/{self.input_db["centroids_segments"]}/',
            runtime_memory=memory_capacity_mb)
        formula_metrics_list, images_cloud_objs = zip(
            *self.pywren_executor.get_result(futures))
        self.formula_metrics_df = pd.concat(formula_metrics_list)
        self.images_cloud_objs = list(chain(*images_cloud_objs))
        append_pywren_stats(futures,
                            memory=memory_capacity_mb,
                            plus_objects=len(self.images_cloud_objs))

        logger.info(f'Metrics calculated: {self.formula_metrics_df.shape[0]}')
def define_centr_segments(pw, clip_centr_chunks_cobjects, centr_n, ds_segm_n,
                          ds_segm_size_mb):
    logger.info('Defining centroid segment bounds')

    def get_first_peak_mz(cobject, id, storage):
        print(
            f'Extracting first peak mz values from clipped centroids dataframe {id}'
        )
        centr_df = read_cloud_object_with_retry(storage, cobject,
                                                pd.read_msgpack)
        first_peak_df = centr_df[centr_df.peak_i == 0]
        return first_peak_df.mz.values

    memory_capacity_mb = 512
    futures = pw.map(get_first_peak_mz,
                     clip_centr_chunks_cobjects,
                     runtime_memory=memory_capacity_mb)
    first_peak_df_mz = np.concatenate(pw.get_result(futures))
    append_pywren_stats(futures, memory_mb=memory_capacity_mb)

    ds_size_mb = ds_segm_n * ds_segm_size_mb
    data_per_centr_segm_mb = 50
    peaks_per_centr_segm = 1e4
    centr_segm_n = int(
        max(ds_size_mb // data_per_centr_segm_mb,
            centr_n // peaks_per_centr_segm, 32))

    segm_bounds_q = [i / centr_segm_n for i in range(centr_segm_n)]
    centr_segm_lower_bounds = np.quantile(first_peak_df_mz, segm_bounds_q)

    logger.info(
        f'Generated {len(centr_segm_lower_bounds)} centroid segment lower bounds: '
        f'{centr_segm_lower_bounds[0]}...{centr_segm_lower_bounds[-1]}'
    )
    return centr_segm_lower_bounds
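The centroid segment count above is the larger of a data-volume estimate and a peak-count estimate, with a floor of 32. A worked example with illustrative numbers:

ds_segm_n, ds_segm_size_mb = 40, 128      # dataset spans 40 * 128 = 5120 MB
centr_n = 1_200_000                       # clipped centroid rows
ds_size_mb = ds_segm_n * ds_segm_size_mb
centr_segm_n = int(max(ds_size_mb // 50,  # 102 segments by data volume
                       centr_n // 1e4,    # 120 segments by peak count
                       32))               # hard floor
print(centr_segm_n)                       # -> 120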
def clip_centr_df(pw, bucket, centr_chunks_prefix, mz_min, mz_max):
    def clip_centr_df_chunk(obj, storage):
        print(f'Clipping centroids dataframe chunk {obj.key}')
        centroids_df_chunk = pd.read_msgpack(
            obj.data_stream._raw_stream).sort_values('mz')
        centroids_df_chunk = centroids_df_chunk[centroids_df_chunk.mz > 0]

        ds_mz_range_unique_formulas = centroids_df_chunk[
            (mz_min < centroids_df_chunk.mz)
            & (centroids_df_chunk.mz < mz_max)].index.unique()
        centr_df_chunk = centroids_df_chunk[centroids_df_chunk.index.isin(
            ds_mz_range_unique_formulas)].reset_index()
        clip_centr_chunk_cobject = storage.put_cobject(
            centr_df_chunk.to_msgpack())

        return clip_centr_chunk_cobject, centr_df_chunk.shape[0]

    memory_capacity_mb = 512
    futures = pw.map(clip_centr_df_chunk,
                     f'cos://{bucket}/{centr_chunks_prefix}/',
                     runtime_memory=memory_capacity_mb)
    clip_centr_chunks_cobjects, centr_n = list(zip(*pw.get_result(futures)))
    append_pywren_stats(futures,
                        memory_mb=memory_capacity_mb,
                        cloud_objects_n=len(futures))

    clip_centr_chunks_cobjects = list(clip_centr_chunks_cobjects)
    centr_n = sum(centr_n)
    logger.info(f'Prepared {centr_n} centroids')
    return clip_centr_chunks_cobjects, centr_n
Example #13
def calculate_centroids(pw, formula_cobjects, ds_config):
    polarity = ds_config['polarity']
    isocalc_sigma = ds_config['isocalc_sigma']

    def calculate_peaks_for_formula(formula_i, formula):
        mzs, ints = isocalc_wrapper.centroids(formula)
        if mzs is not None:
            return list(zip(repeat(formula_i), range(len(mzs)), mzs, ints))
        else:
            return []

    def calculate_peaks_chunk(segm_i, segm_cobject, storage):
        print(f'Calculating peaks from formulas chunk {segm_i}')
        chunk_df = deserialise(
            storage.get_cloudobject(segm_cobject, stream=True))
        peaks = [
            peak for formula_i, formula in chunk_df.items()
            for peak in calculate_peaks_for_formula(formula_i, formula)
        ]
        peaks_df = pd.DataFrame(peaks,
                                columns=['formula_i', 'peak_i', 'mz', 'int'])
        peaks_df.set_index('formula_i', inplace=True)

        print(f'Storing centroids chunk {segm_i}')
        peaks_cobject = storage.put_cloudobject(serialise(peaks_df))

        return peaks_cobject, peaks_df.shape[0]

    from annotation_pipeline.isocalc_wrapper import IsocalcWrapper  # Import lazily so that the rest of the pipeline still works if the dependency is missing
    isocalc_wrapper = IsocalcWrapper({
        # These instrument settings are usually customized on a per-dataset basis out of a set of
        # 18 possible combinations, but most of EMBL's datasets are compatible with the following settings:
        'charge': {
            'polarity': polarity,
            'n_charges': 1,
        },
        'isocalc_sigma': float(f'{isocalc_sigma:f}'),  # Rounded to match the production implementation
    })

    memory_capacity_mb = 2048
    futures = pw.map(calculate_peaks_chunk,
                     list(enumerate(formula_cobjects)),
                     runtime_memory=memory_capacity_mb)
    results = pw.get_result(futures)
    PipelineStats.append_func(futures,
                              memory_mb=memory_capacity_mb,
                              cloud_objects_n=len(futures))

    num_centroids = sum(count for cobj, count in results)
    n_centroids_chunks = len(results)
    peaks_cobjects = [cobj for cobj, count in results]
    logger.info(
        f'Calculated {num_centroids} centroids in {n_centroids_chunks} chunks')
    return peaks_cobjects
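calculate_peaks_for_formula builds one (formula_i, peak_i, mz, int) row per isotope peak by zipping a repeated formula id against the peak arrays. A tiny standalone illustration (peak values are made up):

from itertools import repeat

formula_i = 42
mzs, ints = [100.05, 101.05, 102.06], [1.0, 0.32, 0.05]
rows = list(zip(repeat(formula_i), range(len(mzs)), mzs, ints))
print(rows)  # [(42, 0, 100.05, 1.0), (42, 1, 101.05, 0.32), (42, 2, 102.06, 0.05)]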
Example #14
    def upload_molecular_databases(self, use_cache=True):
        cache_key = ':db/upload_molecular_databases.cache'

        if use_cache and self.cacher.exists(cache_key):
            self.mols_dbs_cobjects = self.cacher.load(cache_key)
            logger.info(f'Loaded {len(self.mols_dbs_cobjects)} molecular databases from cache')
        else:
            self.mols_dbs_cobjects = upload_mol_dbs_from_dir(self.storage, self.db_config['databases'])
            logger.info(f'Uploaded {len(self.mols_dbs_cobjects)} molecular databases')
            self.cacher.save(self.mols_dbs_cobjects, cache_key)
Example #15
def _upload_chunk(ch_i, sp_mz_int_buf):
    # `input_data`, `config` and `cos_client` come from the enclosing scope
    # (this is a nested helper in the original pipeline).
    chunk = msgpack.dumps(sp_mz_int_buf)
    key = f'{input_data["ds_chunks"]}/{ch_i}.msgpack'
    size_mb = sys.getsizeof(chunk) / 1024 ** 2
    logger.info(f'Uploading spectra chunk {ch_i} - {size_mb:.2f} MB')
    cos_client.put_object(Bucket=config["storage"]["ds_bucket"],
                          Key=key,
                          Body=chunk)
    logger.info(f'Spectra chunk {ch_i} finished')
    return key
Example #16
    def run_fdr(self):
        self.rankings_df = build_fdr_rankings(
            self.pywren_executor, self.config["storage"]["db_bucket"],
            self.input_data, self.input_db, self.formula_metrics_df)
        self.fdrs = calculate_fdrs(self.pywren_executor,
                                   self.config['storage']['ds_bucket'],
                                   self.rankings_df)

        logger.info('Number of annotations with FDR less than:')
        for fdr_step in [0.05, 0.1, 0.2, 0.5]:
            logger.info(
                f'{fdr_step*100:2.0f}%: {(self.fdrs.fdr < fdr_step).sum()}')
Example #17
    def load_ds(self, use_cache=True):
        cache_key = ':ds/load_ds.cache'

        if self.hybrid_impl:
            pass  # all work is done in segment_ds
        else:
            if use_cache and self.cacher.exists(cache_key):
                self.imzml_reader, self.imzml_reader_cobject = self.cacher.load(cache_key)
                logger.info(f'Loaded imzml from cache, {len(self.imzml_reader.coordinates)} spectra found')
            else:
                self.imzml_reader, self.imzml_reader_cobject = get_imzml_reader(self.lithops_executor, self.imzml_cobject)
                logger.info(f'Parsed imzml: {len(self.imzml_reader.coordinates)} spectra found')
                self.cacher.save((self.imzml_reader, self.imzml_reader_cobject), cache_key)
Example #18
    def segment_ds(self, use_cache=True, debug_validate=False):
        cache_key = ':ds/segment_ds.cache'

        if self.hybrid_impl:
            if use_cache and self.cacher.exists(cache_key):
                result = self.cacher.load(cache_key)
                logger.info(f'Loaded {len(result[2])} dataset segments from cache')
            else:
                sort_memory = 2**32
                fs = self.lithops_vm_executor.call_async(
                    load_and_split_ds_vm,
                    (self.imzml_cobject, self.ibd_cobject, self.ds_segm_size_mb, sort_memory),
                )
                result = self.lithops_vm_executor.get_result(fs)

                logger.info(f'Segmented dataset chunks into {len(result[2])} segments')
                self.cacher.save(result, cache_key)
            self.imzml_reader, \
            self.ds_segments_bounds, \
            self.ds_segms_cobjects, \
            self.ds_segms_len, \
            ds_segm_stats = result
            for func_name, exec_time in ds_segm_stats:
                if func_name == 'upload_segments':
                    cobjs_n = len(self.ds_segms_cobjects)
                else:
                    cobjs_n = 0
                PipelineStats.append_vm(func_name, exec_time, cloud_objects_n=cobjs_n)
        else:
            if use_cache and self.cacher.exists(cache_key):
                self.ds_segments_bounds, self.ds_segms_cobjects, self.ds_segms_len = \
                    self.cacher.load(cache_key)
                logger.info(f'Loaded {len(self.ds_segms_cobjects)} dataset segments from cache')
            else:
                sample_sp_n = 1000
                self.ds_segments_bounds = define_ds_segments(
                    self.lithops_executor,
                    self.ibd_cobject,
                    self.imzml_reader_cobject,
                    self.ds_segm_size_mb,
                    sample_sp_n,
                )
                self.ds_segms_cobjects, self.ds_segms_len = segment_spectra(
                    self.lithops_executor,
                    self.ds_chunks_cobjects,
                    self.ds_segments_bounds,
                    self.ds_segm_size_mb,
                    self.imzml_reader.mzPrecision,
                )
                logger.info(f'Segmented dataset chunks into {len(self.ds_segms_cobjects)} segments')
                self.cacher.save((self.ds_segments_bounds, self.ds_segms_cobjects, self.ds_segms_len), cache_key)

        self.ds_segm_n = len(self.ds_segms_cobjects)
        self.is_intensive_dataset = self.ds_segm_n * self.ds_segm_size_mb > 5000

        if debug_validate:
            validate_ds_segments(
                self.lithops_executor, self.imzml_reader, self.ds_segments_bounds,
                self.ds_segms_cobjects, self.ds_segms_len, self.hybrid_impl,
            )
Example #19
    def split_ds(self, use_cache=True):
        cache_key = ':ds/split_ds.cache'

        if self.hybrid_impl:
            pass  # all work is done in segment_ds
        else:
            if use_cache and self.cacher.exists(cache_key):
                self.ds_chunks_cobjects = self.cacher.load(cache_key)
                logger.info(f'Loaded {len(self.ds_chunks_cobjects)} dataset chunks from cache')
            else:
                self.ds_chunks_cobjects = chunk_spectra(self.lithops_executor, self.ibd_cobject,
                                                        self.imzml_reader_cobject, self.imzml_reader)
                logger.info(f'Uploaded {len(self.ds_chunks_cobjects)} dataset chunks')
                self.cacher.save(self.ds_chunks_cobjects, cache_key)
Example #20
    def build_database(self, use_cache=True, debug_validate=False):
        if self.hybrid_impl:
            cache_key = ':ds/:db/build_database.cache'
            if use_cache and self.cacher.exists(cache_key):
                self.formula_cobjects, self.db_data_cobjects = self.cacher.load(cache_key)
                logger.info(f'Loaded {len(self.formula_cobjects)} formula segments and'
                            f' {len(self.db_data_cobjects)} db_data objects from cache')
            else:
                futures = self.lithops_vm_executor.call_async(
                    build_database_local,
                    (self.db_config, self.ds_config, self.mols_dbs_cobjects)
                )
                self.formula_cobjects, self.db_data_cobjects, build_db_exec_time = self.lithops_vm_executor.get_result(futures)
                PipelineStats.append_vm('build_database', build_db_exec_time,
                                        cloud_objects_n=len(self.formula_cobjects))
                logger.info(f'Built {len(self.formula_cobjects)} formula segments and'
                            f' {len(self.db_data_cobjects)} db_data objects')
                self.cacher.save((self.formula_cobjects, self.db_data_cobjects), cache_key)
        else:
            cache_key = ':db/build_database.cache'
            if use_cache and self.cacher.exists(cache_key):
                self.formula_cobjects, self.formula_to_id_cobjects = self.cacher.load(cache_key)
                logger.info(f'Loaded {len(self.formula_cobjects)} formula segments and'
                            f' {len(self.formula_to_id_cobjects)} formula-to-id chunks from cache')
            else:
                self.formula_cobjects, self.formula_to_id_cobjects = build_database(
                    self.lithops_executor, self.db_config, self.mols_dbs_cobjects
                )
                logger.info(f'Built {len(self.formula_cobjects)} formula segments and'
                            f' {len(self.formula_to_id_cobjects)} formula-to-id chunks')
                self.cacher.save((self.formula_cobjects, self.formula_to_id_cobjects), cache_key)

        if debug_validate:
            validate_formula_cobjects(self.storage, self.formula_cobjects)
Example #21
    def segment_ds(self):
        clean_from_cos(self.config, self.config["storage"]["ds_bucket"],
                       self.input_data["ds_segments"])
        sample_sp_n = 1000
        self.ds_segments_bounds = define_ds_segments(
            self.imzml_parser, self.ds_segm_size_mb,
            sample_ratio=sample_sp_n / self.sp_n)
        self.ds_segm_n, self.ds_segms_len = segment_spectra(
            self.pywren_executor, self.config["storage"]["ds_bucket"],
            self.input_data["ds_chunks"], self.input_data["ds_segments"],
            self.ds_segments_bounds, self.ds_segm_size_mb,
            self.imzml_parser.mzPrecision)
        logger.info(f'Segmented dataset chunks into {self.ds_segm_n} segments')
Example #22
    def calculate_centroids(self, use_cache=True, debug_validate=False):
        cache_key = ':ds/:db/calculate_centroids.cache'

        if use_cache and self.cacher.exists(cache_key):
            self.peaks_cobjects = self.cacher.load(cache_key)
            logger.info(f'Loaded {len(self.peaks_cobjects)} centroid chunks from cache')
        else:
            self.peaks_cobjects = calculate_centroids(
                self.lithops_executor, self.formula_cobjects, self.ds_config
            )
            logger.info(f'Calculated {len(self.peaks_cobjects)} centroid chunks')
            self.cacher.save(self.peaks_cobjects, cache_key)

        if debug_validate:
            validate_peaks_cobjects(self.lithops_executor, self.peaks_cobjects)
def calculate_centroids(config, input_db, polarity='+', isocalc_sigma=0.001238):
    bucket = config["storage"]["db_bucket"]
    formulas_chunks_prefix = input_db["formulas_chunks"]
    centroids_chunks_prefix = input_db["centroids_chunks"]
    clean_from_cos(config, bucket, centroids_chunks_prefix)

    def calculate_peaks_for_formula(formula_i, formula):
        mzs, ints = isocalc_wrapper.centroids(formula)
        if mzs is not None:
            return list(zip(repeat(formula_i), range(len(mzs)), mzs, ints))
        else:
            return []

    def calculate_peaks_chunk(obj, id, storage):
        print(f'Calculating peaks from formulas chunk {obj.key}')
        chunk_df = pd.read_msgpack(obj.data_stream._raw_stream)
        peaks = [peak for formula_i, formula in chunk_df.formula.items()
                 for peak in calculate_peaks_for_formula(formula_i, formula)]
        peaks_df = pd.DataFrame(peaks, columns=['formula_i', 'peak_i', 'mz', 'int'])
        peaks_df.set_index('formula_i', inplace=True)

        print(f'Storing centroids chunk {id}')
        centroids_chunk_key = f'{centroids_chunks_prefix}/{id}.msgpack'
        storage.put_object(Bucket=bucket, Key=centroids_chunk_key, Body=peaks_df.to_msgpack())

        return peaks_df.shape[0]

    from annotation_pipeline.isocalc_wrapper import IsocalcWrapper # Import lazily so that the rest of the pipeline still works if the dependency is missing
    isocalc_wrapper = IsocalcWrapper({
        # These instrument settings are usually customized on a per-dataset basis out of a set of
        # 18 possible combinations, but most of EMBL's datasets are compatible with the following settings:
        'charge': {
            'polarity': polarity,
            'n_charges': 1,
        },
        'isocalc_sigma': float(f"{isocalc_sigma:f}") # Rounding to match production implementation
    })

    pw = pywren.ibm_cf_executor(config=config)
    memory_capacity_mb = 2048
    futures = pw.map(calculate_peaks_chunk, f'cos://{bucket}/{formulas_chunks_prefix}/', runtime_memory=memory_capacity_mb)
    centroids_chunks_n = pw.get_result(futures)
    append_pywren_stats(futures, memory_mb=memory_capacity_mb, cloud_objects_n=len(futures))

    num_centroids = sum(centroids_chunks_n)
    n_centroids_chunks = len(centroids_chunks_n)
    logger.info(f'Calculated {num_centroids} centroids in {n_centroids_chunks} chunks')
    return num_centroids, n_centroids_chunks
Example #24
    def split_ds(self):
        ds_chunks_cache_key = f'{self.cacher.prefix}/split_ds.cache'

        if self.cacher.exists(ds_chunks_cache_key):
            self.ds_chunks_cobjects = self.cacher.load(ds_chunks_cache_key)
            logger.info(
                f'Loaded {len(self.ds_chunks_cobjects)} dataset chunks from cache'
            )
        else:
            self.ds_chunks_cobjects = chunk_spectra(self.pywren_executor,
                                                    self.ds_config['ibd_path'],
                                                    self.imzml_cobject,
                                                    self.imzml_reader)
            logger.info(
                f'Uploaded {len(self.ds_chunks_cobjects)} dataset chunks')
            self.cacher.save(self.ds_chunks_cobjects, ds_chunks_cache_key)
Example #25
    def __init__(self, ds_config, db_config, use_db_cache=True, use_ds_cache=True, hybrid_impl='auto'):

        self.config = default_config()
        self.ds_config = ds_config
        self.db_config = db_config
        self.use_db_cache = use_db_cache
        self.use_ds_cache = use_ds_cache
        if hybrid_impl == 'auto':
            self.hybrid_impl = (
                self.config['lithops']['mode'] == 'localhost'
                or self.config['lithops']['mode'] == 'serverless' and 'ibm_vpc' in self.config
            )
            if self.hybrid_impl:
                logger.info('Using the Hybrid implementation')
            else:
                logger.info('Using the pure Serverless implementation')
        else:
            self.hybrid_impl = hybrid_impl

        lithops_bucket = self.config['lithops']['storage_bucket']
        self.ds_bucket = self.config.get('storage', {}).get('ds_bucket', lithops_bucket)

        self.lithops_executor = lithops.FunctionExecutor(config=self.config, runtime_memory=2048)
        if self.hybrid_impl:
            if self.config['lithops']['mode'] == 'localhost':
                self.lithops_vm_executor = self.lithops_executor
            else:
                self.lithops_vm_executor = lithops.StandaloneExecutor(config=self.config)

        self.storage = Storage(config=self.config)

        cache_namespace = 'vm' if self.hybrid_impl else 'function'  # use the resolved flag, not the raw 'auto' argument
        self.cacher = PipelineCacher(
            self.storage, lithops_bucket, cache_namespace, self.ds_config["name"], self.db_config["name"]
        )
        if not self.use_db_cache or not self.use_ds_cache:
            self.cacher.clean(database=not self.use_db_cache, dataset=not self.use_ds_cache)

        stats_path_cache_key = ':ds/:db/stats_path.cache'
        if self.cacher.exists(stats_path_cache_key):
            self.stats_path = self.cacher.load(stats_path_cache_key)
            PipelineStats.path = self.stats_path
            logger.info(f'Using cached {self.stats_path} for statistics')
        else:
            PipelineStats.init()
            self.stats_path = PipelineStats.path
            self.cacher.save(self.stats_path, stats_path_cache_key)
            logger.info(f'Initialised {self.stats_path} for statistics')

        self.ds_segm_size_mb = 128
        self.image_gen_config = {
            "q": 99,
            "do_preprocessing": False,
            "nlevels": 30,
            "ppm": 3.0
        }
Example #26
    def load_ds(self):
        imzml_cache_key = f'{self.cacher.prefix}/load_ds.cache'

        if self.cacher.exists(imzml_cache_key):
            self.imzml_reader, self.imzml_cobject = self.cacher.load(
                imzml_cache_key)
            logger.info(
                f'Loaded imzml from cache, {len(self.imzml_reader.coordinates)} spectra found'
            )
        else:
            self.imzml_reader, self.imzml_cobject = get_imzml_reader(
                self.pywren_executor, self.ds_config['imzml_path'])
            logger.info(
                f'Parsed imzml: {len(self.imzml_reader.coordinates)} spectra found'
            )
            self.cacher.save((self.imzml_reader, self.imzml_cobject),
                             imzml_cache_key)
Example #27
    def save_results(self, out_dir='.'):
        out_dir = Path(out_dir)
        images_dir = out_dir / 'images'
        images_dir.mkdir(parents=True, exist_ok=True)

        results_df = self.get_results()
        results_df.to_csv(out_dir / 'results.csv')
        image_sets = self.get_images(True, True)
        # Keep a reference reachable from the interactive session (appears to be a debugging aid)
        __import__('__main__').image_sets = image_sets

        filenames = (results_df.full_mol + '.png').to_dict()
        n_saved_images = 0
        for formula_i, image_set in image_sets.items():
            if image_set[0] is not None and formula_i in filenames:
                (images_dir / filenames[formula_i]).write_bytes(image_set[0])
                n_saved_images += 1

        logger.info(f'Saved results.csv and {n_saved_images} images to {out_dir.resolve()}')
Example #28
def define_ds_segments(imzml_parser, ds_segm_size_mb=5, sample_ratio=0.05):
    logger.info('Defining dataset segments bounds')
    spectra_sample = list(spectra_sample_gen(imzml_parser, sample_ratio=sample_ratio))

    spectra_mzs = np.array([mz for sp_id, mzs, ints in spectra_sample for mz in mzs])
    total_n_mz = spectra_mzs.shape[0] / sample_ratio

    float_prec = 4 if imzml_parser.mzPrecision == 'f' else 8
    segm_arr_columns = 3
    segm_n = segm_arr_columns * (total_n_mz * float_prec) // (ds_segm_size_mb * 2 ** 20)
    segm_n = max(1, int(segm_n))

    segm_bounds_q = [i / segm_n for i in range(segm_n + 1)]
    segm_lower_bounds = [np.quantile(spectra_mzs, q) for q in segm_bounds_q]
    ds_segments = np.array(list(zip(segm_lower_bounds[:-1], segm_lower_bounds[1:])))

    logger.info(f'Generated {len(ds_segments)} dataset segment bounds: {ds_segments[0]}...{ds_segments[-1]}')
    return ds_segments
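Here segm_n comes from the estimated in-memory size of the full (spectrum idx, m/z, intensity) array divided by the target segment size. A rough worked example with illustrative numbers:

total_n_mz = 50_000_000       # estimated m/z values in the whole dataset
float_prec = 4                # mzPrecision 'f' -> 4 bytes per value
segm_arr_columns = 3          # spectrum idx, m/z, intensity
ds_segm_size_mb = 5
segm_n = segm_arr_columns * (total_n_mz * float_prec) // (ds_segm_size_mb * 2 ** 20)
print(max(1, int(segm_n)))    # -> 114 segments of roughly 5 MB each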
Example #29
    def clean(self, database=True, dataset=True, hard=False):
        unique_prefixes = []
        if not hard:
            if database:
                unique_prefixes.append(self.prefixes[':db'])
            if dataset:
                unique_prefixes.append(self.prefixes[':ds'])
            if database or dataset:
                unique_prefixes.append(self.prefixes[':ds/:db'])
        else:
            unique_prefixes.append(self.prefixes[''])

        keys = [
            key for prefix in unique_prefixes
            for key in self.storage.list_keys(self.bucket, prefix)
        ]

        cobjects_to_clean = []
        for cache_key in keys:
            cache_data = read_object_with_retry(self.storage, self.bucket,
                                                cache_key, deserialise)

            if isinstance(cache_data, tuple):
                for obj in cache_data:
                    if isinstance(obj, list):
                        if obj and isinstance(obj[0], CloudObject):
                            cobjects_to_clean.extend(obj)
                    elif isinstance(obj, CloudObject):
                        cobjects_to_clean.append(obj)
            elif isinstance(cache_data, list):
                if cache_data and isinstance(cache_data[0], CloudObject):
                    cobjects_to_clean.extend(cache_data)
            elif isinstance(cache_data, CloudObject):
                cobjects_to_clean.append(cache_data)

        self.storage.delete_cloudobjects(cobjects_to_clean)
        for prefix in unique_prefixes:
            keys = self.storage.list_keys(self.bucket, prefix)
            if keys:
                self.storage.delete_objects(self.bucket, keys)
                logger.info(
                    f'Removed {len(keys)} objects from {self.storage.backend}://{self.bucket}/{prefix}'
                )
Example #30
    def run_fdr(self, use_cache=True):
        cache_key = ':ds/:db/run_fdr.cache'

        if use_cache and self.cacher.exists(cache_key):
            self.fdrs = self.cacher.load(cache_key)
            logger.info('Loaded fdrs from cache')
        else:
            if self.hybrid_impl:
                futures = self.lithops_vm_executor.call_async(
                    calculate_fdrs_vm,
                    (self.formula_metrics_df, self.db_data_cobjects),
                )
                self.fdrs, fdr_exec_time = self.lithops_vm_executor.get_result(futures)

                PipelineStats.append_vm('calculate_fdrs', fdr_exec_time)
            else:
                rankings_df = build_fdr_rankings(
                    self.lithops_executor, self.ds_config, self.db_config, self.mols_dbs_cobjects,
                    self.formula_to_id_cobjects, self.formula_metrics_df
                )
                self.fdrs = calculate_fdrs(self.lithops_executor, rankings_df)
            self.cacher.save(self.fdrs, cache_key)

        logger.info('Number of annotations with FDR less than:')
        for fdr_step in [0.05, 0.1, 0.2, 0.5]:
            logger.info(f'{fdr_step*100:2.0f}%: {(self.fdrs.fdr < fdr_step).sum()}')