def define_centr_segments(pw, clip_centr_chunks_cobjects, centr_n, ds_segm_n, ds_segm_size_mb):
    """Compute the lower m/z bound of every centroids segment.

    The first-peak m/z values of all clipped centroid chunks are gathered
    remotely, and the bounds are taken as evenly spaced quantiles of them.

    :param pw: executor used to run the extraction over the chunks
    :param clip_centr_chunks_cobjects: cloud objects with clipped centroid chunks
    :param centr_n: total number of centroid rows
    :param ds_segm_n: number of dataset segments
    :param ds_segm_size_mb: size of one dataset segment in MB
    :return: array with the lower m/z bound of each centroids segment
    """
    runtime_mb = 512
    mb_per_centr_segm = 50
    peaks_per_centr_segm = 1e4
    min_centr_segm_n = 32

    logger.info('Defining centroids segments bounds')

    # NOTE(review): `id` and `storage` look like arguments injected by the
    # executor by name -- confirm before renaming them.
    def get_first_peak_mz(cobject, id, storage):
        print(
            f'Extracting first peak mz values from clipped centroids dataframe {id}'
        )
        chunk_df = read_cloud_object_with_retry(storage, cobject, pd.read_msgpack)
        is_first_peak = chunk_df.peak_i == 0
        return chunk_df[is_first_peak].mz.values

    futures = pw.map(get_first_peak_mz,
                     clip_centr_chunks_cobjects,
                     runtime_memory=runtime_mb)
    all_first_peak_mzs = np.concatenate(pw.get_result(futures))
    append_pywren_stats(futures, memory_mb=runtime_mb)

    # The segment count is driven by whichever is larger: the dataset size,
    # the number of centroids, or the fixed minimum.
    segm_n_by_ds_size = (ds_segm_n * ds_segm_size_mb) // mb_per_centr_segm
    segm_n_by_peaks = centr_n // peaks_per_centr_segm
    centr_segm_n = int(max(segm_n_by_ds_size, segm_n_by_peaks, min_centr_segm_n))

    quantile_fractions = [i / centr_segm_n for i in range(centr_segm_n)]
    centr_segm_lower_bounds = np.quantile(all_first_peak_mzs, quantile_fractions)

    logger.info(
        f'Generated {len(centr_segm_lower_bounds)} centroids bounds: {centr_segm_lower_bounds[0]}...{centr_segm_lower_bounds[-1]}'
    )
    return centr_segm_lower_bounds
def define_ds_segments(pw, ibd_url, imzml_cobject, ds_segm_size_mb, sample_n):
    """Estimate the m/z bounds of the dataset segments from a spectra sample.

    A remote function samples spectra, extrapolates the total dataset size
    from the sample and splits the sampled m/z values into quantile ranges so
    that each segment is expected to hold about ``ds_segm_size_mb`` of data.

    :param pw: executor used to run the sampling function
    :param ibd_url: location of the binary spectra data, passed to ``get_spectra``
    :param imzml_cobject: cloud object holding the pickled imzML reader
    :param ds_segm_size_mb: target size of one dataset segment in MB
    :param sample_n: maximum number of spectra to sample
    :return: array of ``(lower, upper)`` m/z bounds, one row per segment
    """
    runtime_mb = 1024

    def get_segm_bounds(storage):
        reader = pickle.loads(
            read_cloud_object_with_retry(storage, imzml_cobject))
        spectra_total = len(reader.coordinates)
        sampled_inds = np.random.choice(np.arange(spectra_total),
                                        min(spectra_total, sample_n))
        print(f'Sampling {len(sampled_inds)} spectra')

        sampled_mzs = np.concatenate([
            mzs
            for _sp_id, mzs, _ints in get_spectra(ibd_url, reader, sampled_inds)
        ])
        print(f'Got {len(sampled_mzs)} mzs')

        # Extrapolate from the sample to the whole dataset; the factor 3
        # mirrors the three values stored per peak elsewhere in the pipeline.
        estimated_bytes = 3 * sampled_mzs.nbytes * spectra_total / len(sampled_inds)
        segm_n = int(np.ceil(estimated_bytes / (ds_segm_size_mb * 2**20)))

        edges = [
            np.quantile(sampled_mzs, i / segm_n) for i in range(segm_n + 1)
        ]
        # Pair consecutive edges into (lower, upper) rows.
        return np.array(list(zip(edges[:-1], edges[1:])))

    logger.info('Defining dataset segments bounds')
    future = pw.call_async(get_segm_bounds, [], runtime_memory=runtime_mb)
    ds_segments = pw.get_result(future)
    append_pywren_stats(future, memory_mb=runtime_mb)
    return ds_segments
def clip_centr_df(pw, bucket, centr_chunks_prefix, mz_min, mz_max):
    """Clip every centroids chunk to the dataset m/z range.

    A formula is kept, with all of its peaks, when at least one of its
    positive-m/z peaks lies strictly between ``mz_min`` and ``mz_max``.

    :param pw: executor used to map over the centroid chunks
    :param bucket: bucket holding the centroid chunks
    :param centr_chunks_prefix: key prefix of the centroid chunks
    :param mz_min: lower m/z limit (exclusive)
    :param mz_max: upper m/z limit (exclusive)
    :return: tuple ``(list of clipped chunk cloud objects, total row count)``
    """
    runtime_mb = 512

    # NOTE(review): `obj` and `storage` look like arguments injected by the
    # executor by name -- confirm before renaming them.
    def clip_centr_df_chunk(obj, storage):
        print(f'Clipping centroids dataframe chunk {obj.key}')
        peaks = pd.read_msgpack(obj.data_stream._raw_stream)
        peaks = peaks.sort_values('mz')
        peaks = peaks[peaks.mz > 0]

        within_ds_range = (peaks.mz > mz_min) & (peaks.mz < mz_max)
        kept_formulas = peaks[within_ds_range].index.unique()
        clipped = peaks[peaks.index.isin(kept_formulas)].reset_index()

        cobject = storage.put_cobject(clipped.to_msgpack())
        return cobject, clipped.shape[0]

    chunks_location = f'cos://{bucket}/{centr_chunks_prefix}/'
    futures = pw.map(clip_centr_df_chunk,
                     chunks_location,
                     runtime_memory=runtime_mb)
    cobjects_per_chunk, rows_per_chunk = zip(*pw.get_result(futures))
    append_pywren_stats(futures,
                        memory_mb=runtime_mb,
                        cloud_objects_n=len(futures))

    clip_centr_chunks_cobjects = list(cobjects_per_chunk)
    centr_n = sum(rows_per_chunk)
    logger.info(f'Prepared {centr_n} centroids')
    return clip_centr_chunks_cobjects, centr_n
def annotate(self):
    """Compute the formula metrics and images, reusing cached results if present.

    On a cache hit the metrics dataframe and image cloud objects are loaded
    from the cacher. Otherwise every centroids segment is processed remotely,
    the results are merged into ``self.formula_metrics_df`` and
    ``self.images_cloud_objs``, and both are saved to the cache.
    """
    cache_key = f'{self.cacher.prefix}/annotate.cache'

    if self.cacher.exists(cache_key):
        self.formula_metrics_df, self.images_cloud_objs = self.cacher.load(cache_key)
        logger.info(
            f'Loaded {self.formula_metrics_df.shape[0]} metrics from cache')
        return

    logger.info('Annotating...')
    if self.is_intensive_dataset:
        runtime_mb = 4096
    else:
        runtime_mb = 2048

    process_centr_segment = create_process_segment(
        self.ds_segms_cobjects,
        self.ds_segments_bounds,
        self.ds_segms_len,
        self.imzml_reader,
        self.image_gen_config,
        runtime_mb,
        self.ds_segm_size_mb,
    )

    executor = self.pywren_executor
    futures = executor.map(process_centr_segment,
                           self.db_segms_cobjects,
                           runtime_memory=runtime_mb)
    metrics_per_segment, images_per_segment = zip(*executor.get_result(futures))

    self.formula_metrics_df = pd.concat(metrics_per_segment)
    # Flatten the per-segment image lists into one list.
    self.images_cloud_objs = list(chain.from_iterable(images_per_segment))
    append_pywren_stats(futures,
                        memory_mb=runtime_mb,
                        cloud_objects_n=len(self.images_cloud_objs))
    logger.info(f'Metrics calculated: {self.formula_metrics_df.shape[0]}')

    self.cacher.save((self.formula_metrics_df, self.images_cloud_objs), cache_key)
def annotate(self):
    """Compute the formula metrics and images for every centroids segment.

    Objects under the formula images prefix of the output bucket are removed
    first. Each centroids segment is then processed remotely, and the results
    are merged into ``self.formula_metrics_df`` and ``self.images_cloud_objs``.
    """
    logger.info('Annotating...')

    storage_cfg = self.config["storage"]
    clean_from_cos(self.config, storage_cfg["output_bucket"],
                   self.output["formula_images"])

    # TODO: Detect when this isn't enough and bump it up to 4096
    runtime_mb = 2048
    process_centr_segment = create_process_segment(
        storage_cfg["ds_bucket"],
        storage_cfg["output_bucket"],
        self.input_data["ds_segments"],
        self.ds_segments_bounds,
        self.ds_segms_len,
        self.coordinates,
        self.image_gen_config,
        runtime_mb,
        self.ds_segm_size_mb,
        self.imzml_parser.mzPrecision,
    )

    db_bucket = storage_cfg["db_bucket"]
    segments_prefix = self.input_db["centroids_segments"]
    executor = self.pywren_executor
    futures = executor.map(process_centr_segment,
                           f'{db_bucket}/{segments_prefix}/',
                           runtime_memory=runtime_mb)
    metrics_per_segment, images_per_segment = zip(*executor.get_result(futures))

    self.formula_metrics_df = pd.concat(metrics_per_segment)
    # Flatten the per-segment image lists into one list.
    self.images_cloud_objs = list(chain.from_iterable(images_per_segment))
    append_pywren_stats(futures,
                        memory=runtime_mb,
                        plus_objects=len(self.images_cloud_objs))
    logger.info(f'Metrics calculated: {self.formula_metrics_df.shape[0]}')
def calculate_fdrs(pw, rankings_df):
    """Compute an FDR value for every target molecule in every target ranking.

    Within each ``group_i`` of ``rankings_df``, every target ranking is
    compared against all decoy rankings of the same group; the per-formula
    FDR is the median over those comparisons.

    :param pw: executor used to run one job per target ranking
    :param rankings_df: dataframe with ``group_i``, ``is_target``, ``cobject``,
        ``database_path``, ``adduct`` and ``modifier`` columns
    :return: concatenation of the per-target-ranking result dataframes
    """

    def run_ranking(target_cobject, decoy_cobject, storage):
        # Rank one target ranking against one decoy ranking.
        target = pickle.loads(
            read_cloud_object_with_retry(storage, target_cobject))
        decoy = pickle.loads(
            read_cloud_object_with_retry(storage, decoy_cobject))
        merged = pd.concat(
            [target.assign(is_target=1),
             decoy.assign(is_target=0)], sort=False)
        # Best scores first, so the cumulative sums count hits at or above
        # each score.
        merged = merged.sort_values('msm', ascending=False)
        decoy_cumsum = (merged.is_target == False).cumsum()
        target_cumsum = merged.is_target.cumsum()
        # Ratio of decoys to targets, limited to [0, 1]; undefined ratios
        # (NaN) are treated as the worst value, 1.
        base_fdr = np.clip(decoy_cumsum / target_cumsum, 0, 1)
        base_fdr[np.isnan(base_fdr)] = 1
        target_fdrs = merged.assign(fdr=base_fdr)[lambda df: df.is_target == 1]
        target_fdrs = target_fdrs.drop('is_target', axis=1)
        # Walking from the lowest score upwards, keep the running minimum of
        # the FDR.
        target_fdrs = target_fdrs.sort_values('msm')
        target_fdrs = target_fdrs.assign(
            fdr=np.minimum.accumulate(target_fdrs.fdr))
        target_fdrs = target_fdrs.sort_index()
        return target_fdrs

    def merge_rankings(target_row, decoy_cobjects, storage):
        # Combine the FDRs of one target ranking across all decoy rankings.
        print("Merging rankings...")
        print(target_row)
        rankings = [
            run_ranking(target_row.cobject, decoy_cobject, storage)
            for decoy_cobject in decoy_cobjects
        ]
        # Per formula: median FDR (ignoring NaN) and the first molecule seen.
        mols = (pd.concat(rankings).rename_axis(
            'formula_i').reset_index().groupby('formula_i').agg({
                'fdr': np.nanmedian,
                'mol': 'first'
            }).assign(database_path=target_row.database_path,
                      adduct=target_row.adduct,
                      modifier=target_row.modifier))
        return mols

    # One job per target ranking, paired with all decoys of its group.
    ranking_jobs = []
    for group_i, group in rankings_df.groupby('group_i'):
        target_rows = group[group.is_target]
        decoy_rows = group[~group.is_target]

        for i, target_row in target_rows.iterrows():
            ranking_jobs.append([target_row, decoy_rows.cobject.tolist()])

    memory_capacity_mb = 256
    futures = pw.map(merge_rankings,
                     ranking_jobs,
                     runtime_memory=memory_capacity_mb)
    results = pw.get_result(futures)
    append_pywren_stats(futures, memory_mb=memory_capacity_mb)

    return pd.concat(results)
def calculate_centroids(config, input_db, polarity='+', isocalc_sigma=0.001238):
    """Calculate isotopic peak centroids for every formulas chunk.

    Existing centroid chunks are removed first. Each formulas chunk is then
    processed remotely and its peaks are stored as a msgpack dataframe under
    the centroid chunks prefix.

    :param config: pipeline configuration; ``config["storage"]["db_bucket"]`` is used
    :param input_db: dict with the ``formulas_chunks`` and ``centroids_chunks`` prefixes
    :param polarity: charge polarity passed to the isotope calculator
    :param isocalc_sigma: sigma passed to the isotope calculator
    :return: tuple ``(total number of centroids, number of centroid chunks)``
    """
    bucket = config["storage"]["db_bucket"]
    formulas_chunks_prefix = input_db["formulas_chunks"]
    centroids_chunks_prefix = input_db["centroids_chunks"]
    clean_from_cos(config, bucket, centroids_chunks_prefix)

    # Import lazily so that the rest of the pipeline still works if the dependency is missing
    from annotation_pipeline.isocalc_wrapper import IsocalcWrapper
    isocalc_settings = {
        # These instrument settings are usually customized on a per-dataset basis out of a set of
        # 18 possible combinations, but most of EMBL's datasets are compatible with the following settings:
        'charge': {
            'polarity': polarity,
            'n_charges': 1,
        },
        # Rounding to match production implementation
        'isocalc_sigma': float(f"{isocalc_sigma:f}"),
    }
    isocalc_wrapper = IsocalcWrapper(isocalc_settings)

    def calculate_peaks_for_formula(formula_i, formula):
        mzs, ints = isocalc_wrapper.centroids(formula)
        if mzs is None:
            return []
        return [(formula_i, peak_i, mz, intensity)
                for peak_i, (mz, intensity) in enumerate(zip(mzs, ints))]

    # NOTE(review): `obj`, `id` and `storage` look like arguments injected by
    # the executor by name -- confirm before renaming them.
    def calculate_peaks_chunk(obj, id, storage):
        print(f'Calculating peaks from formulas chunk {obj.key}')
        formulas_df = pd.read_msgpack(obj.data_stream._raw_stream)

        peak_rows = []
        for formula_i, formula in formulas_df.formula.items():
            peak_rows.extend(calculate_peaks_for_formula(formula_i, formula))
        peaks_df = pd.DataFrame(peak_rows,
                                columns=['formula_i', 'peak_i', 'mz', 'int'])
        peaks_df = peaks_df.set_index('formula_i')

        print(f'Storing centroids chunk {id}')
        storage.put_object(Bucket=bucket,
                           Key=f'{centroids_chunks_prefix}/{id}.msgpack',
                           Body=peaks_df.to_msgpack())
        return peaks_df.shape[0]

    runtime_mb = 2048
    pw = pywren.ibm_cf_executor(config=config)
    futures = pw.map(calculate_peaks_chunk,
                     f'cos://{bucket}/{formulas_chunks_prefix}/',
                     runtime_memory=runtime_mb)
    peaks_per_chunk = pw.get_result(futures)
    append_pywren_stats(futures,
                        memory_mb=runtime_mb,
                        cloud_objects_n=len(futures))

    n_centroids_chunks = len(peaks_per_chunk)
    num_centroids = sum(peaks_per_chunk)
    logger.info(f'Calculated {num_centroids} centroids in {n_centroids_chunks} chunks')
    return num_centroids, n_centroids_chunks
def get_imzml_reader(pw, imzml_path):
    """Parse a remote imzML file and return a portable spectrum reader.

    The parsing runs in a remote function, which also stores the pickled
    reader as a cloud object so that later remote steps can load it.

    :param pw: executor used to run the parsing function
    :param imzml_path: URL of the imzML file, fetched with ``requests.get``
    :return: tuple ``(imzml_reader, imzml_cobject)``
    """

    # NOTE(review): `storage` looks like an argument injected by the executor
    # by name -- confirm before renaming it.
    def get_portable_imzml_reader(storage):
        imzml_stream = requests.get(imzml_path, stream=True).raw
        parser = ImzMLParser(imzml_stream, ibd_file=None)
        imzml_reader = parser.portable_spectrum_reader()
        imzml_cobject = storage.put_cobject(pickle.dumps(imzml_reader))
        return imzml_reader, imzml_cobject

    memory_capacity_mb = 1024
    # Request the same amount of memory that is reported to the stats below;
    # previously the call ran with the executor's default runtime memory.
    future = pw.call_async(get_portable_imzml_reader, [],
                           runtime_memory=memory_capacity_mb)
    imzml_reader, imzml_cobject = pw.get_result(future)
    append_pywren_stats(future,
                        memory_mb=memory_capacity_mb,
                        cloud_objects_n=1)

    return imzml_reader, imzml_cobject
def clip_centr_df(pw, bucket, centr_chunks_prefix, clip_centr_chunk_prefix, mz_min, mz_max):
    """Clip every centroids chunk to the dataset m/z range and store the result.

    A formula is kept, with all of its peaks, when at least one of its
    positive-m/z peaks lies strictly between ``mz_min`` and ``mz_max``.
    Each clipped chunk is written under ``clip_centr_chunk_prefix``.

    :param pw: executor used to map over the centroid chunks
    :param bucket: bucket holding the input and output chunks
    :param centr_chunks_prefix: key prefix of the input centroid chunks
    :param clip_centr_chunk_prefix: key prefix for the clipped chunks
    :param mz_min: lower m/z limit (exclusive)
    :param mz_max: upper m/z limit (exclusive)
    :return: total number of rows kept over all chunks
    """
    runtime_mb = 512

    # NOTE(review): `obj`, `id` and `ibm_cos` look like arguments injected by
    # the executor by name -- confirm before renaming them.
    def clip_centr_df_chunk(obj, id, ibm_cos):
        print(f'Clipping centroids dataframe chunk {obj.key}')
        peaks = pd.read_msgpack(obj.data_stream._raw_stream)
        peaks = peaks.sort_values('mz')
        peaks = peaks[peaks.mz > 0]

        within_ds_range = (peaks.mz > mz_min) & (peaks.mz < mz_max)
        kept_formulas = peaks[within_ds_range].index.unique()
        clipped = peaks[peaks.index.isin(kept_formulas)].reset_index()

        ibm_cos.put_object(Bucket=bucket,
                           Key=f'{clip_centr_chunk_prefix}/{id}.msgpack',
                           Body=clipped.to_msgpack())
        return clipped.shape[0]

    futures = pw.map(clip_centr_df_chunk,
                     f'{bucket}/{centr_chunks_prefix}/',
                     runtime_memory=runtime_mb)
    rows_per_chunk = pw.get_result(futures)
    centr_n = sum(rows_per_chunk)
    append_pywren_stats(futures, memory=runtime_mb, plus_objects=len(futures))

    logger.info(f'Prepared {centr_n} centroids')
    return centr_n
def define_centr_segments(pw, bucket, clip_centr_chunk_prefix, centr_n, ds_segm_n, ds_segm_size_mb):
    """Compute the lower m/z bound of every centroids segment.

    The first-peak m/z values of all clipped centroid chunks under
    ``clip_centr_chunk_prefix`` are gathered remotely, and the bounds are
    taken as evenly spaced quantiles of them.

    :param pw: executor used to map over the clipped chunks
    :param bucket: bucket holding the clipped chunks
    :param clip_centr_chunk_prefix: key prefix of the clipped chunks
    :param centr_n: total number of centroid rows
    :param ds_segm_n: number of dataset segments
    :param ds_segm_size_mb: size of one dataset segment in MB
    :return: array with the lower m/z bound of each centroids segment
    """
    runtime_mb = 512
    mb_per_centr_segm = 50
    peaks_per_centr_segm = 1e4
    min_centr_segm_n = 32

    logger.info('Defining centroids segments bounds')

    # NOTE(review): `obj` looks like an argument injected by the executor by
    # name -- confirm before renaming it.
    def get_first_peak_mz(obj):
        print(f'Extracting first peak mz values from clipped centroids dataframe {obj.key}')
        chunk_df = pd.read_msgpack(obj.data_stream._raw_stream)
        is_first_peak = chunk_df.peak_i == 0
        return chunk_df[is_first_peak].mz.values

    futures = pw.map(get_first_peak_mz,
                     f'{bucket}/{clip_centr_chunk_prefix}/',
                     runtime_memory=runtime_mb)
    all_first_peak_mzs = np.concatenate(pw.get_result(futures))
    append_pywren_stats(futures, memory=runtime_mb)

    # The segment count is driven by whichever is larger: the dataset size,
    # the number of centroids, or the fixed minimum.
    segm_n_by_ds_size = (ds_segm_n * ds_segm_size_mb) // mb_per_centr_segm
    segm_n_by_peaks = centr_n // peaks_per_centr_segm
    centr_segm_n = int(max(segm_n_by_ds_size, segm_n_by_peaks, min_centr_segm_n))

    quantile_fractions = [i / centr_segm_n for i in range(centr_segm_n)]
    centr_segm_lower_bounds = np.quantile(all_first_peak_mzs, quantile_fractions)

    logger.info(f'Generated {len(centr_segm_lower_bounds)} centroids bounds: {centr_segm_lower_bounds[0]}...{centr_segm_lower_bounds[-1]}')
    return centr_segm_lower_bounds
def chunk_spectra(pw, ibd_path, imzml_cobject, imzml_reader):
    """Split the dataset spectra into size-limited chunks and upload them.

    Chunks are planned locally from the spectrum lengths. Each chunk is then
    built remotely as an ``(n_peaks, 3)`` array of (pixel index, mz,
    intensity) rows sorted by mz, and stored as a msgpack cloud object.

    :param pw: executor used to build and upload the chunks
    :param ibd_path: location of the binary spectra data, passed to ``get_spectra``
    :param imzml_cobject: cloud object holding the pickled imzML reader
    :param imzml_reader: local imzML reader used for planning the chunks
    :return: list of cloud objects, one per uploaded chunk
    """
    MAX_CHUNK_SIZE = 512 * 1024**2  # 512MB

    sp_id_to_idx = get_pixel_indices(imzml_reader.coordinates)

    # Estimated bytes per peak: three values, each at least 4 bytes wide.
    row_size = 3 * max(4,
                       np.dtype(imzml_reader.mzPrecision).itemsize,
                       np.dtype(imzml_reader.intensityPrecision).itemsize)

    def plan_chunks():
        chunk_sp_inds = []
        estimated_size_bytes = 0
        # Iterate in the same order that intensities are laid out in the file, hopefully this will
        # prevent fragmented read patterns
        for sp_i in np.argsort(imzml_reader.intensityOffsets):
            spectrum_size = imzml_reader.mzLengths[sp_i] * row_size
            # Only close a chunk that already holds spectra; otherwise a
            # single spectrum larger than MAX_CHUNK_SIZE would emit an empty
            # chunk before it.
            if chunk_sp_inds and estimated_size_bytes + spectrum_size > MAX_CHUNK_SIZE:
                estimated_size_bytes = 0
                yield np.array(chunk_sp_inds)
                chunk_sp_inds = []

            estimated_size_bytes += spectrum_size
            chunk_sp_inds.append(sp_i)

        if chunk_sp_inds:
            yield np.array(chunk_sp_inds)

    # NOTE(review): `storage` looks like an argument injected by the executor
    # by name -- confirm before renaming it.
    def upload_chunk(ch_i, storage):
        chunk_sp_inds = chunks[ch_i]
        # Get imzml_reader from COS because it's too big to include via pywren captured vars
        imzml_reader = pickle.loads(
            read_cloud_object_with_retry(storage, imzml_cobject))
        n_peaks = sum(imzml_reader.mzLengths[sp_i] for sp_i in chunk_sp_inds)
        sp_mz_int_buf = np.zeros((n_peaks, 3), dtype=imzml_reader.mzPrecision)

        # Fill the buffer spectrum by spectrum.
        chunk_start = 0
        for sp_i, mzs, ints in get_spectra(ibd_path, imzml_reader, chunk_sp_inds):
            chunk_end = chunk_start + len(mzs)
            sp_mz_int_buf[chunk_start:chunk_end, 0] = sp_id_to_idx[sp_i]
            sp_mz_int_buf[chunk_start:chunk_end, 1] = mzs
            sp_mz_int_buf[chunk_start:chunk_end, 2] = ints
            chunk_start = chunk_end

        # Sort the rows by mz (column 1); drop the index array once used.
        by_mz = np.argsort(sp_mz_int_buf[:, 1])
        sp_mz_int_buf = sp_mz_int_buf[by_mz]
        del by_mz

        chunk = msgpack.dumps(sp_mz_int_buf)
        size = sys.getsizeof(chunk) * (1 / 1024**2)
        logger.info(f'Uploading spectra chunk {ch_i} - {size:.2f} MB')
        chunk_cobject = storage.put_cobject(chunk)
        logger.info(f'Spectra chunk {ch_i} finished')
        return chunk_cobject

    # `chunks` is captured by `upload_chunk`, which receives only the index.
    chunks = list(plan_chunks())

    memory_capacity_mb = 3072
    futures = pw.map(upload_chunk,
                     range(len(chunks)),
                     runtime_memory=memory_capacity_mb)
    ds_chunks_cobjects = pw.get_result(futures)
    append_pywren_stats(futures,
                        memory_mb=memory_capacity_mb,
                        cloud_objects_n=len(chunks))

    return ds_chunks_cobjects
def segment_centroids(pw, clip_centr_chunks_cobjects, centr_segm_lower_bounds, ds_segms_bounds, ds_segm_size_mb, max_ds_segms_size_per_db_segm_mb, ppm):
    """Split the clipped centroids into m/z segments stored as cloud objects.

    Segmentation happens in two levels. Every clipped chunk is first split
    into at most 32 coarse segments. The pieces of each coarse segment are
    then merged and split again, and the segment that needs the most dataset
    segments is re-split repeatedly until it fits the size limit.

    :param pw: executor used for both levels
    :param clip_centr_chunks_cobjects: cloud objects with clipped centroid chunks
    :param centr_segm_lower_bounds: array of lower m/z bounds of the target segments
    :param ds_segms_bounds: dataset segment bounds, passed to ``choose_ds_segments``
    :param ds_segm_size_mb: size of one dataset segment in MB
    :param max_ds_segms_size_per_db_segm_mb: maximum dataset data (MB) one
        centroids segment may require
    :param ppm: passed to ``choose_ds_segments``
    :return: list of cloud objects, one per final centroids segment
    """
    centr_segm_n = len(centr_segm_lower_bounds)
    centr_segm_lower_bounds = centr_segm_lower_bounds.copy()

    # define first level segmentation and then segment each one into desired number
    first_level_centr_segm_n = min(32, len(centr_segm_lower_bounds))
    # From here on `centr_segm_lower_bounds` is a list of arrays, one per
    # first-level segment.
    centr_segm_lower_bounds = np.array_split(centr_segm_lower_bounds, first_level_centr_segm_n)
    first_level_centr_segm_bounds = np.array([bounds[0] for bounds in centr_segm_lower_bounds])

    def segment_centr_df(centr_df, db_segm_lower_bounds):
        # Assign each formula to a segment by the mz of its first peak, then
        # copy that segment index onto all of the formula's peaks.
        first_peak_df = centr_df[centr_df.peak_i == 0].copy()
        segment_mapping = np.searchsorted(db_segm_lower_bounds, first_peak_df.mz.values, side='right') - 1
        first_peak_df['segm_i'] = segment_mapping
        centr_segm_df = pd.merge(centr_df, first_peak_df[['formula_i', 'segm_i']], on='formula_i').sort_values('mz')
        return centr_segm_df

    # NOTE(review): `id` and `storage` look like arguments injected by the
    # executor by name -- confirm before renaming them.
    def segment_centr_chunk(cobject, id, storage):
        print(f'Segmenting clipped centroids dataframe chunk {id}')
        centr_df = read_cloud_object_with_retry(storage, cobject, pd.read_msgpack)
        centr_segm_df = segment_centr_df(centr_df, first_level_centr_segm_bounds)

        def _first_level_upload(args):
            segm_i, df = args
            del df['segm_i']
            return segm_i, storage.put_cobject(df.to_msgpack())

        with ThreadPoolExecutor(max_workers=128) as pool:
            sub_segms = [(segm_i, df) for segm_i, df in centr_segm_df.groupby('segm_i')]
            sub_segms_cobjects = list(pool.map(_first_level_upload, sub_segms))

        # Maps first-level segment index -> cloud object for this chunk.
        return dict(sub_segms_cobjects)

    memory_capacity_mb = 512
    first_futures = pw.map(segment_centr_chunk, clip_centr_chunks_cobjects, runtime_memory=memory_capacity_mb)
    first_level_segms_cobjects = pw.get_result(first_futures)
    append_pywren_stats(first_futures, memory_mb=memory_capacity_mb, cloud_objects_n=len(first_futures) * len(centr_segm_lower_bounds))

    def merge_centr_df_segments(segm_cobjects, id, storage):
        print(f'Merging segment {id} clipped centroids chunks')

        def _merge(cobject):
            segm_centr_df_chunk = read_cloud_object_with_retry(storage, cobject, pd.read_msgpack)
            return segm_centr_df_chunk

        with ThreadPoolExecutor(max_workers=128) as pool:
            segm = pd.concat(list(pool.map(_merge, segm_cobjects)))

        def _second_level_segment(segm, sub_segms_n):
            # Split `segm` at quantiles of its own first-peak mz values.
            segm_bounds_q = [i * 1 / sub_segms_n for i in range(0, sub_segms_n)]
            sub_segms_lower_bounds = np.quantile(segm[segm.peak_i == 0].mz.values, segm_bounds_q)
            centr_segm_df = segment_centr_df(segm, sub_segms_lower_bounds)

            sub_segms = []
            for segm_i, df in centr_segm_df.groupby('segm_i'):
                del df['segm_i']
                sub_segms.append(df)
            return sub_segms

        # NOTE(review): `id` is this job's position in the sorted list built
        # below; it matches the first-level segment index only when no
        # first-level segment is empty -- confirm.
        init_segms = _second_level_segment(segm, len(centr_segm_lower_bounds[id]))

        from annotation_pipeline.image import choose_ds_segments
        # Pair every sub-segment with the number of dataset segments it spans.
        segms = []
        for init_segm in init_segms:
            first_ds_segm_i, last_ds_segm_i = choose_ds_segments(ds_segms_bounds, init_segm, ppm)
            ds_segms_to_download_n = last_ds_segm_i - first_ds_segm_i + 1
            segms.append((ds_segms_to_download_n, init_segm))
        segms = sorted(segms, key=lambda x: x[0], reverse=True)
        max_ds_segms_to_download_n, max_segm = segms[0]

        # Re-split the most demanding sub-segment until it fits the limit, or
        # until the iteration cap is reached.
        max_iterations_n = 100
        iterations_n = 1
        while max_ds_segms_to_download_n * ds_segm_size_mb > max_ds_segms_size_per_db_segm_mb and iterations_n < max_iterations_n:
            sub_segms = []
            sub_segms_n = math.ceil(max_ds_segms_to_download_n * ds_segm_size_mb / max_ds_segms_size_per_db_segm_mb)
            for sub_segm in _second_level_segment(max_segm, sub_segms_n):
                first_ds_segm_i, last_ds_segm_i = choose_ds_segments(ds_segms_bounds, sub_segm, ppm)
                ds_segms_to_download_n = last_ds_segm_i - first_ds_segm_i + 1
                sub_segms.append((ds_segms_to_download_n, sub_segm))

            # Replace the split segment (index 0) by its parts and re-rank.
            segms = sub_segms + segms[1:]
            segms = sorted(segms, key=lambda x: x[0], reverse=True)
            iterations_n += 1
            max_ds_segms_to_download_n, max_segm = segms[0]

        def _second_level_upload(df):
            return storage.put_cobject(df.to_msgpack())

        print(f'Storing {len(segms)} centroids segments')
        with ThreadPoolExecutor(max_workers=128) as pool:
            segms = [df for _, df in segms]
            segms_cobjects = list(pool.map(_second_level_upload, segms))

        return segms_cobjects

    from collections import defaultdict
    # Regroup the first-level outputs: segment index -> pieces from all chunks.
    second_level_segms_cobjects = defaultdict(list)
    for sub_segms_cobjects in first_level_segms_cobjects:
        for first_level_segm_i in sub_segms_cobjects:
            second_level_segms_cobjects[first_level_segm_i].append(sub_segms_cobjects[first_level_segm_i])
    second_level_segms_cobjects = sorted(second_level_segms_cobjects.items(), key=lambda x: x[0])
    # Each job's list of cloud objects is wrapped in another list.
    # NOTE(review): presumably so the executor passes it as one argument -- confirm.
    second_level_segms_cobjects = [[cobjects] for segm_i, cobjects in second_level_segms_cobjects]

    memory_capacity_mb = 2048
    second_futures = pw.map(merge_centr_df_segments, second_level_segms_cobjects, runtime_memory=memory_capacity_mb)
    db_segms_cobjects = list(np.concatenate(pw.get_result(second_futures)))
    append_pywren_stats(second_futures, memory_mb=memory_capacity_mb, cloud_objects_n=centr_segm_n)

    return db_segms_cobjects
def segment_spectra(pw, bucket, ds_chunks_prefix, ds_segments_prefix, ds_segments_bounds, ds_segm_size_mb, segm_dtype):
    """Re-partition the spectra chunks into m/z segments stored in the bucket.

    Segmentation happens in two levels. Every spectra chunk is first split
    into coarse first-level segments, written under
    ``{ds_segments_prefix}/chunk/{segm_i}/``. The pieces of each first-level
    segment are then merged, sorted by mz and split into the final segments
    ``{ds_segments_prefix}/{id}.msgpack``.

    :param pw: executor used for both levels
    :param bucket: bucket holding the chunks and receiving the segments
    :param ds_chunks_prefix: key prefix of the spectra chunks
    :param ds_segments_prefix: key prefix for the segments
    :param ds_segments_bounds: array of ``(lower, upper)`` m/z bounds per segment
    :param ds_segm_size_mb: size of one dataset segment in MB
    :param segm_dtype: unused here; referenced only by a commented-out alternative
    :return: tuple ``(number of segments, list of segment lengths)``
    """
    ds_segm_n = len(ds_segments_bounds)

    # extend boundaries of the first and last segments
    # to include all mzs outside of the spectra sample mz range
    ds_segments_bounds = ds_segments_bounds.copy()
    ds_segments_bounds[0, 0] = 0
    ds_segments_bounds[-1, 1] = MAX_MZ_VALUE

    # define first level segmentation and then segment each one into desired number
    first_level_segm_size_mb = 512
    first_level_segm_n = (len(ds_segments_bounds) * ds_segm_size_mb) // first_level_segm_size_mb
    first_level_segm_n = max(first_level_segm_n, 1)
    # From here on `ds_segments_bounds` is a list of arrays, one per
    # first-level segment.
    ds_segments_bounds = np.array_split(ds_segments_bounds, first_level_segm_n)

    # NOTE(review): `obj`, `id` and `ibm_cos` look like arguments injected by
    # the executor by name -- confirm before renaming them.
    def segment_spectra_chunk(obj, id, ibm_cos):
        print(f'Segmenting spectra chunk {obj.key}')
        sp_mz_int_buf = msgpack.loads(obj.data_stream.read())

        def _first_level_segment_upload(segm_i):
            lower = ds_segments_bounds[segm_i][0, 0]
            upper = ds_segments_bounds[segm_i][-1, 1]
            # mz expected to be in column 1
            segm_start, segm_end = np.searchsorted(sp_mz_int_buf[:, 1], (lower, upper))
            segm = sp_mz_int_buf[segm_start:segm_end]
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{ds_segments_prefix}/chunk/{segm_i}/{id}.msgpack',
                               Body=msgpack.dumps(segm))

        with ThreadPoolExecutor(max_workers=128) as pool:
            # Drain the result iterator: Executor.map only re-raises a
            # worker's exception when its result is retrieved, so without
            # this a failed upload would go unnoticed.
            list(pool.map(_first_level_segment_upload, range(len(ds_segments_bounds))))

    memory_safe_mb = 1024
    memory_capacity_mb = first_level_segm_size_mb * 2 + memory_safe_mb
    first_futures = pw.map(segment_spectra_chunk,
                           f'{bucket}/{ds_chunks_prefix}/',
                           runtime_memory=memory_capacity_mb)
    pw.get_result(first_futures)
    if not isinstance(first_futures, list):
        first_futures = [first_futures]
    append_pywren_stats(first_futures,
                        memory=memory_capacity_mb,
                        plus_objects=len(first_futures) * len(ds_segments_bounds))

    def merge_spectra_chunk_segments(segm_i, ibm_cos):
        print(f'Merging segment {segm_i} spectra chunks')

        chunk_prefix = f'{ds_segments_prefix}/chunk/{segm_i}/'
        keys = list_keys(bucket, chunk_prefix, ibm_cos)

        def _merge(key):
            return read_object_with_retry(ibm_cos, bucket, key, msgpack.load)

        with ThreadPoolExecutor(max_workers=128) as pool:
            segm = list(pool.map(_merge, keys))

        segm = np.concatenate(segm)

        # Alternative in-place sorting (slower) :
        # segm.view(f'{segm_dtype},{segm_dtype},{segm_dtype}').sort(order=['f1'], axis=0)
        segm = segm[segm[:, 1].argsort()]

        # The first-level pieces are no longer needed once merged.
        clean_from_cos(None, bucket, chunk_prefix, ibm_cos)

        bounds_list = ds_segments_bounds[segm_i]
        # Global id of this first-level segment's first final segment.
        base_id = sum(len(bounds) for bounds in ds_segments_bounds[:segm_i])

        segms_len = []
        for segm_j, (lower, upper) in enumerate(bounds_list):
            # mz expected to be in column 1
            segm_start, segm_end = np.searchsorted(segm[:, 1], (lower, upper))
            sub_segm = segm[segm_start:segm_end]
            segms_len.append(len(sub_segm))

            segm_id = base_id + segm_j
            print(f'Storing dataset segment {segm_id}')
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{ds_segments_prefix}/{segm_id}.msgpack',
                               Body=msgpack.dumps(sub_segm))

        return segms_len

    # same memory capacity
    second_futures = pw.map(merge_spectra_chunk_segments,
                            range(len(ds_segments_bounds)),
                            runtime_memory=memory_capacity_mb)
    ds_segms_len = list(np.concatenate(pw.get_result(second_futures)))
    append_pywren_stats(second_futures,
                        memory=memory_capacity_mb,
                        plus_objects=ds_segm_n,
                        minus_objects=len(first_futures) * len(ds_segments_bounds))

    return ds_segm_n, ds_segms_len
def segment_centroids(pw, bucket, clip_centr_chunk_prefix, centr_segm_prefix, centr_segm_lower_bounds):
    """Split the clipped centroids into m/z segments stored in the bucket.

    Segmentation happens in two levels. Every clipped chunk is first split
    into at most 32 coarse segments, written under
    ``{centr_segm_prefix}/chunk/{segm_i}/``. The pieces of each coarse segment
    are then merged and split into the final segments
    ``{centr_segm_prefix}/{id}.msgpack``.

    :param pw: executor used for both levels
    :param bucket: bucket holding the clipped chunks and receiving the segments
    :param clip_centr_chunk_prefix: key prefix of the clipped centroid chunks
    :param centr_segm_prefix: key prefix for the centroid segments
    :param centr_segm_lower_bounds: array of lower m/z bounds of the target segments
    :return: number of target segments (``len(centr_segm_lower_bounds)``)
    """
    centr_segm_n = len(centr_segm_lower_bounds)
    centr_segm_lower_bounds = centr_segm_lower_bounds.copy()

    # define first level segmentation and then segment each one into desired number
    first_level_centr_segm_n = min(32, len(centr_segm_lower_bounds))
    # From here on `centr_segm_lower_bounds` is a list of arrays, one per
    # first-level segment.
    centr_segm_lower_bounds = np.array_split(centr_segm_lower_bounds, first_level_centr_segm_n)
    first_level_centr_segm_bounds = np.array([bounds[0] for bounds in centr_segm_lower_bounds])

    def segment_centr_df(centr_df, db_segm_lower_bounds):
        # Assign each formula to a segment by the mz of its first peak, then
        # copy that segment index onto all of the formula's peaks.
        first_peak_df = centr_df[centr_df.peak_i == 0].copy()
        segment_mapping = np.searchsorted(db_segm_lower_bounds, first_peak_df.mz.values, side='right') - 1
        first_peak_df['segm_i'] = segment_mapping
        centr_segm_df = pd.merge(centr_df, first_peak_df[['formula_i', 'segm_i']],
                                 on='formula_i').sort_values('mz')
        return centr_segm_df

    # NOTE(review): `obj`, `id` and `ibm_cos` look like arguments injected by
    # the executor by name -- confirm before renaming them.
    def segment_centr_chunk(obj, id, ibm_cos):
        print(f'Segmenting clipped centroids dataframe chunk {obj.key}')
        centr_df = pd.read_msgpack(obj.data_stream._raw_stream)
        centr_segm_df = segment_centr_df(centr_df, first_level_centr_segm_bounds)

        def _first_level_upload(args):
            segm_i, df = args
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{centr_segm_prefix}/chunk/{segm_i}/{id}.msgpack',
                               Body=df.to_msgpack())

        sub_segms = list(centr_segm_df.groupby('segm_i'))
        with ThreadPoolExecutor(max_workers=128) as pool:
            # Drain the result iterator: Executor.map only re-raises a
            # worker's exception when its result is retrieved, so without
            # this a failed upload would go unnoticed.
            list(pool.map(_first_level_upload, sub_segms))

    memory_capacity_mb = 512
    first_futures = pw.map(segment_centr_chunk,
                           f'{bucket}/{clip_centr_chunk_prefix}/',
                           runtime_memory=memory_capacity_mb)
    pw.get_result(first_futures)
    append_pywren_stats(first_futures,
                        memory=memory_capacity_mb,
                        plus_objects=len(first_futures) * len(centr_segm_lower_bounds))

    def merge_centr_df_segments(segm_i, ibm_cos):
        print(f'Merging segment {segm_i} clipped centroids chunks')

        chunk_prefix = f'{centr_segm_prefix}/chunk/{segm_i}/'
        keys = list_keys(bucket, chunk_prefix, ibm_cos)

        def _merge(key):
            return read_object_with_retry(ibm_cos, bucket, key, pd.read_msgpack)

        with ThreadPoolExecutor(max_workers=128) as pool:
            segm = pd.concat(list(pool.map(_merge, keys)))
        # Drop the first-level assignment; it is recomputed just below.
        del segm['segm_i']
        # The first-level pieces are no longer needed once merged.
        clean_from_cos(None, bucket, chunk_prefix, ibm_cos)

        centr_segm_df = segment_centr_df(segm, centr_segm_lower_bounds[segm_i])
        # Global id of this first-level segment's first final segment.
        base_id = sum(len(bounds) for bounds in centr_segm_lower_bounds[:segm_i])

        def _second_level_upload(args):
            segm_j, df = args
            segm_id = base_id + segm_j
            print(f'Storing centroids segment {segm_id}')
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{centr_segm_prefix}/{segm_id}.msgpack',
                               Body=df.to_msgpack())

        sub_segms = list(centr_segm_df.groupby('segm_i'))
        with ThreadPoolExecutor(max_workers=128) as pool:
            # Drained for the same reason as the first-level upload.
            list(pool.map(_second_level_upload, sub_segms))

    memory_capacity_mb = 1024
    second_futures = pw.map(merge_centr_df_segments,
                            range(len(centr_segm_lower_bounds)),
                            runtime_memory=memory_capacity_mb)
    pw.get_result(second_futures)
    append_pywren_stats(second_futures,
                        memory=memory_capacity_mb,
                        plus_objects=centr_segm_n,
                        minus_objects=len(first_futures) * len(centr_segm_lower_bounds))

    return centr_segm_n
def build_fdr_rankings(pw, bucket, input_data, input_db, formula_scores_df):
    """Build the target and decoy MSM rankings used for FDR calculation.

    For every (database, modifier) combination, one target ranking is built
    per adduct in ``input_db['adducts']`` and ``n_decoy_rankings`` decoy
    rankings are built with random decoy adducts. Each ranking is stored as a
    pickled dataframe in a cloud object.

    :param pw: executor used to run one job per ranking
    :param bucket: bucket holding the molecule lists and formula_to_id chunks
    :param input_data: dict; the optional ``num_decoys`` key sets the number of decoy rankings
    :param input_db: dict with ``databases``, ``modifiers``, ``adducts`` and
        ``formula_to_id_chunks`` entries
    :param formula_scores_df: dataframe whose ``msm`` column is indexed by formula id
    :return: dataframe with one row per ranking: ``group_i``, ``ranking_i``,
        ``database_path``, ``modifier``, ``adduct``, ``is_target``, ``cobject``
    """

    # NOTE(review): `id` and `storage` look like arguments injected by the
    # executor by name -- confirm before renaming them.
    def build_ranking(group_i, ranking_i, database, modifier, adduct, id, storage):
        print("Building ranking...")
        print(f'job_i: {id}')
        print(f'ranking_i: {ranking_i}')
        print(f'database: {database}')
        print(f'modifier: {modifier}')
        print(f'adduct: {adduct}')
        # For every unmodified formula in `database`, look up the MSM score for the molecule
        # that it would become after the modifier and adduct are applied
        mols = pickle.loads(read_object_with_retry(storage, bucket, database))
        if adduct is not None:
            # Target rankings use the same adduct for all molecules
            mol_formulas = list(
                map(safe_generate_ion_formula, mols, repeat(modifier), repeat(adduct)))
        else:
            # Decoy rankings use a consistent random adduct for each molecule, chosen so that it doesn't overlap
            # with other decoy rankings for this molecule
            adducts = _get_random_adduct_set(len(mols), decoy_adducts, ranking_i)
            mol_formulas = list(
                map(safe_generate_ion_formula, mols, repeat(modifier), adducts))

        # Resolve the formulas to ids by scanning every formula_to_id chunk.
        formula_to_id = {}
        keys = list_keys(bucket, f'{input_db["formula_to_id_chunks"]}/', storage)
        for key in keys:
            formula_to_id_chunk = read_object_with_retry(storage, bucket, key, msgpack_load_text)
            for formula in mol_formulas:
                formula_i = formula_to_id_chunk.get(formula)
                if formula_i is not None:
                    formula_to_id[formula] = formula_i

        formula_is = [formula and formula_to_id.get(formula) for formula in mol_formulas]
        # Compare against None explicitly: formula id 0 is a valid id, and a
        # truthiness test would give it an MSM of 0 instead of its real score.
        msm = [
            msm_lookup.get(formula_i) if formula_i is not None else None
            for formula_i in formula_is
        ]
        if adduct is not None:
            ranking_df = pd.DataFrame({'mol': mols, 'msm': msm}, index=formula_is)
            ranking_df = ranking_df[~ranking_df.msm.isna()]
        else:
            # Specific molecules don't matter in the decoy rankings, only their msm distribution
            ranking_df = pd.DataFrame({'msm': msm})
            ranking_df = ranking_df[~ranking_df.msm.isna()]

        return id, storage.put_cobject(pickle.dumps(ranking_df))

    decoy_adducts = sorted(set(DECOY_ADDUCTS).difference(input_db['adducts']))
    n_decoy_rankings = input_data.get('num_decoys', len(decoy_adducts))
    # Ideally this data would stay in COS so it doesn't have to be reuploaded
    msm_lookup = formula_scores_df.msm.to_dict()

    # Create a job for each list of molecules to be ranked
    ranking_jobs = []
    for group_i, (database, modifier) in enumerate(
            product(input_db['databases'], input_db['modifiers'])):
        # Target and decoy rankings are treated differently. Decoy rankings are identified by not having an adduct.
        ranking_jobs.extend(
            (group_i, ranking_i, database, modifier, adduct)
            for ranking_i, adduct in enumerate(input_db['adducts']))
        ranking_jobs.extend(
            (group_i, ranking_i, database, modifier, None)
            for ranking_i in range(n_decoy_rankings))

    memory_capacity_mb = 1536
    futures = pw.map(build_ranking, ranking_jobs, runtime_memory=memory_capacity_mb)
    # Order the cloud objects by job id so they line up with `ranking_jobs`.
    ranking_cobjects = [
        cobject
        for job_i, cobject in sorted(pw.get_result(futures), key=lambda result: result[0])
    ]
    append_pywren_stats(futures,
                        memory_mb=memory_capacity_mb,
                        cloud_objects_n=len(futures))

    rankings_df = pd.DataFrame(
        ranking_jobs,
        columns=['group_i', 'ranking_i', 'database_path', 'modifier', 'adduct'])
    rankings_df = rankings_df.assign(is_target=~rankings_df.adduct.isnull(),
                                     cobject=ranking_cobjects)

    return rankings_df
def build_database(config, input_db):
    """Generate, deduplicate and store all ion formulas and their id lookup.

    Steps, each run remotely:

    1. For every adduct (including decoy adducts), generate the ion formulas
       of all databases and modifiers and bucket them by a hash of the formula.
    2. Count the distinct formulas of every hash segment.
    3. Store every deduplicated segment as dataframes with a global
       ``formula_i`` index.
    4. Build and store the ``formula -> formula_i`` dictionaries.

    :param config: pipeline configuration; ``config["storage"]["db_bucket"]`` is used
    :param input_db: dict with ``formulas_chunks``, ``formula_to_id_chunks``,
        ``adducts``, ``modifiers`` and ``databases`` entries
    :return: tuple ``(number of formulas, number of formula chunks)``
    """
    bucket = config["storage"]["db_bucket"]
    formulas_chunks_prefix = input_db["formulas_chunks"]
    clean_from_cos(config, bucket, formulas_chunks_prefix)

    adducts = [*input_db['adducts'], *DECOY_ADDUCTS]
    modifiers = input_db['modifiers']
    databases = input_db['databases']

    N_HASH_SEGMENTS = 32  # should be less than N_FORMULAS_SEGMENTS

    def hash_formula_to_segment(formula):
        m = hashlib.md5()
        m.update(formula.encode('utf-8'))
        return int(m.hexdigest(), 16) % N_HASH_SEGMENTS

    # NOTE(review): `ibm_cos` looks like an argument injected by the executor
    # by name -- confirm before renaming it.
    def generate_formulas(adduct, ibm_cos):
        print(f'Generating formulas for adduct {adduct}')

        def _get_mols(mols_key):
            return pickle.loads(read_object_with_retry(ibm_cos, bucket, mols_key))

        with ThreadPoolExecutor(max_workers=128) as pool:
            mols_list = list(pool.map(_get_mols, databases))

        formulas = set()
        for mols in mols_list:
            for modifier in modifiers:
                formulas.update(
                    map(safe_generate_ion_formula, mols, repeat(modifier), repeat(adduct)))

        # None entries are not formulas and cannot be hashed below.
        formulas.discard(None)

        # Bucket the formulas by hash so that equal formulas generated for
        # different adducts end up in the same segment.
        formulas_segments = {}
        for formula in formulas:
            formulas_segments.setdefault(hash_formula_to_segment(formula), []).append(formula)

        def _store(segm_i):
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{formulas_chunks_prefix}/chunk/{segm_i}/{adduct}.pickle',
                               Body=pickle.dumps(formulas_segments[segm_i]))

        segments_n = list(formulas_segments)
        with ThreadPoolExecutor(max_workers=128) as pool:
            # Drain the result iterator: Executor.map only re-raises a
            # worker's exception when its result is retrieved, so without
            # this a failed upload would go unnoticed.
            list(pool.map(_store, segments_n))

        return segments_n

    pw = pywren.ibm_cf_executor(config=config)
    memory_capacity_mb = 512
    futures = pw.map(generate_formulas, adducts, runtime_memory=memory_capacity_mb)
    segments_n = list(set().union(*pw.get_result(futures)))
    append_pywren_stats(futures,
                        memory=memory_capacity_mb,
                        plus_objects=len(adducts) * len(segments_n))

    def deduplicate_formulas_segment(segm_i, ibm_cos, clean=True):
        print(f'Deduplicating formulas segment {segm_i}')
        keys = list_keys(bucket, f'{formulas_chunks_prefix}/chunk/{segm_i}/', ibm_cos)

        segm = set()
        for key in keys:
            segm_formulas_chunk = pickle.loads(read_object_with_retry(ibm_cos, bucket, key))
            segm.update(segm_formulas_chunk)

        if clean:
            clean_from_cos(config, bucket, f'{formulas_chunks_prefix}/chunk/{segm_i}/', ibm_cos)

        return segm

    def get_formulas_number_per_chunk(segm_i, ibm_cos):
        # Counting must not delete the chunks; they are read again when the
        # segment is stored.
        segm = deduplicate_formulas_segment(segm_i, ibm_cos, clean=False)
        return len(segm)

    pw = pywren.ibm_cf_executor(config=config)
    memory_capacity_mb = 512
    futures = pw.map(get_formulas_number_per_chunk, segments_n, runtime_memory=memory_capacity_mb)
    formulas_nums = pw.get_result(futures)
    append_pywren_stats(futures, memory=memory_capacity_mb)

    def store_formulas_segment(segm_i, ibm_cos):
        segm = deduplicate_formulas_segment(segm_i, ibm_cos)
        # `formulas_nums` is ordered like `segments_n`, so pair the two
        # instead of slicing by the segment value; slicing gives the wrong
        # offset whenever a hash segment is missing or the list is unordered.
        formula_i_start = sum(segm_formulas_n
                              for other_segm_i, segm_formulas_n in zip(segments_n, formulas_nums)
                              if other_segm_i < segm_i)
        formula_i_end = formula_i_start + len(segm)
        segm = pd.DataFrame(sorted(segm),
                            columns=['formula'],
                            index=pd.RangeIndex(formula_i_start, formula_i_end, name='formula_i'))

        ibm_cos.put_object(Bucket=bucket,
                           Key=f'{formulas_chunks_prefix}_fdr/{segm_i}.msgpack',
                           Body=segm.to_msgpack())

        n_threads = N_FORMULAS_SEGMENTS // N_HASH_SEGMENTS
        subsegm_size = math.ceil(len(segm) / n_threads)
        segm_list = [segm[i:i + subsegm_size] for i in range(0, segm.shape[0], subsegm_size)]

        def _store(segm_j):
            chunk_id = segm_i * n_threads + segm_j
            print(f'Storing formulas segment {chunk_id}')
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{formulas_chunks_prefix}/{chunk_id}.msgpack',
                               Body=segm_list[segm_j].to_msgpack())

        with ThreadPoolExecutor(max_workers=128) as pool:
            # Iterate over the sub-segments that exist: rounding up can
            # produce fewer than `n_threads` of them. The results are drained
            # so that upload errors are raised instead of being dropped.
            list(pool.map(_store, range(len(segm_list))))

        return [len(sub_segm) for sub_segm in segm_list]

    pw = pywren.ibm_cf_executor(config=config)
    memory_capacity_mb = 512
    futures = pw.map(store_formulas_segment, segments_n, runtime_memory=memory_capacity_mb)
    results = pw.get_result(futures)
    append_pywren_stats(futures,
                        memory=memory_capacity_mb,
                        plus_objects=N_FORMULAS_SEGMENTS,
                        minus_objects=len(adducts) * len(segments_n))

    num_formulas = sum(formulas_nums)
    n_formulas_chunks = sum(len(result) for result in results)
    logger.info(f'Generated {num_formulas} formulas in {n_formulas_chunks} chunks')

    formula_to_id_chunks_prefix = input_db["formula_to_id_chunks"]
    clean_from_cos(config, bucket, formula_to_id_chunks_prefix)

    # Size the lookup chunks from an estimate of 200 bytes per formula.
    formulas_bytes = 200 * num_formulas
    formula_to_id_chunk_mb = 512
    N_FORMULA_TO_ID = int(math.ceil(formulas_bytes / (formula_to_id_chunk_mb * 1024**2)))

    def store_formula_to_id_chunk(ch_i, ibm_cos):
        print(f'Storing formula_to_id dictionary chunk {ch_i}')
        start_id = (N_FORMULAS_SEGMENTS // N_FORMULA_TO_ID) * ch_i
        end_id = (N_FORMULAS_SEGMENTS // N_FORMULA_TO_ID) * (ch_i + 1)
        keys = [
            f'{formulas_chunks_prefix}/{formulas_chunk}.msgpack'
            for formulas_chunk in range(start_id, end_id)
        ]

        def _get(key):
            formula_chunk = read_object_with_retry(ibm_cos, bucket, key, pd.read_msgpack)
            return dict(zip(formula_chunk.formula, formula_chunk.index))

        with ThreadPoolExecutor(max_workers=128) as pool:
            results = list(pool.map(_get, keys))

        formula_to_id = {}
        for chunk_dict in results:
            formula_to_id.update(chunk_dict)

        ibm_cos.put_object(Bucket=bucket,
                           Key=f'{formula_to_id_chunks_prefix}/{ch_i}.msgpack',
                           Body=msgpack.dumps(formula_to_id))

    pw = pywren.ibm_cf_executor(config=config)
    safe_mb = 512
    memory_capacity_mb = formula_to_id_chunk_mb * 2 + safe_mb
    futures = pw.map(store_formula_to_id_chunk,
                     range(N_FORMULA_TO_ID),
                     runtime_memory=memory_capacity_mb)
    pw.get_result(futures)
    append_pywren_stats(futures, memory=memory_capacity_mb, plus_objects=N_FORMULA_TO_ID)
    logger.info(f'Built {N_FORMULA_TO_ID} formula_to_id dictionaries chunks')

    return num_formulas, n_formulas_chunks
def segment_spectra(pw, ds_chunks_cobjects, ds_segments_bounds, ds_segm_size_mb, ds_segm_dtype):
    """Re-partition dataset spectra chunks into mz-bounded dataset segments.

    The work is done in two remote map passes:

    1. Every spectra chunk is cut into a small number of coarse
       ("first level") mz ranges, each covering a contiguous group of the
       requested segments, and every cut is uploaded as a cloud object.
    2. For every first-level range, the pieces coming from all chunks are
       downloaded, concatenated, sorted by mz and cut into the final
       segments, which are uploaded as cloud objects.

    Args:
        pw: Executor exposing ``map`` and ``get_result``.
        ds_chunks_cobjects: Cloud objects, one per spectra chunk. Each is
            read with ``msgpack.load`` and indexed as a 2-D array with mz in
            column 1; the first pass uses ``np.searchsorted`` on that column,
            so chunks are presumably already sorted by mz (confirm against
            the chunk producer).
        ds_segments_bounds: 2-D array with one ``(lower, upper)`` mz row per
            dataset segment. It is copied, not modified.
        ds_segm_size_mb: Target size of one dataset segment in MB; used only
            to decide how many first-level groups to create.
        ds_segm_dtype: Not used by the current implementation (it only
            appears in the commented-out in-place sorting alternative).

    Returns:
        Tuple ``(ds_segms_cobjects, ds_segms_len)``: the cloud objects of the
        final segments and the number of rows stored in each of them, both
        ordered by segment index.
    """
    ds_segm_n = len(ds_segments_bounds)

    # extend boundaries of the first and last segments
    # to include all mzs outside of the spectra sample mz range
    segm_bounds = ds_segments_bounds.copy()
    segm_bounds[0, 0] = 0
    segm_bounds[-1, 1] = MAX_MZ_VALUE

    # define first level segmentation and then segment each one into desired number
    first_level_segm_size_mb = 512
    first_level_segm_n = (ds_segm_n * ds_segm_size_mb) // first_level_segm_size_mb
    # At least one group, and never more groups than segments: np.array_split
    # would otherwise return empty groups whose bounds cannot be indexed.
    first_level_segm_n = min(max(first_level_segm_n, 1), ds_segm_n)
    first_level_bounds = np.array_split(segm_bounds, first_level_segm_n)

    # NOTE: the `id` and `storage` parameters of the mapped functions are not
    # part of the iterdata passed to `pw.map`; they appear to be filled in by
    # the executor based on their names, so they must not be renamed.
    def segment_spectra_chunk(chunk_cobject, id, storage):
        print(f'Segmenting spectra chunk {id}')
        sp_mz_int_buf = read_cloud_object_with_retry(storage, chunk_cobject,
                                                     msgpack.load)

        def _first_level_segment_upload(segm_i):
            # A first-level range spans from the lower bound of its first
            # segment to the upper bound of its last segment.
            group_bounds = first_level_bounds[segm_i]
            lower = group_bounds[0, 0]
            upper = group_bounds[-1, 1]
            segm_start, segm_end = np.searchsorted(
                sp_mz_int_buf[:, 1], (lower, upper))  # mz expected to be in column 1
            segm = sp_mz_int_buf[segm_start:segm_end]
            return storage.put_cobject(msgpack.dumps(segm))

        with ThreadPoolExecutor(max_workers=128) as pool:
            sub_segms_cobjects = list(
                pool.map(_first_level_segment_upload,
                         range(len(first_level_bounds))))

        return sub_segms_cobjects

    memory_safe_mb = 1536
    memory_capacity_mb = first_level_segm_size_mb * 2 + memory_safe_mb
    first_futures = pw.map(segment_spectra_chunk,
                           ds_chunks_cobjects,
                           runtime_memory=memory_capacity_mb)
    first_level_segms_cobjects = pw.get_result(first_futures)
    if not isinstance(first_futures, list):
        first_futures = [first_futures]
    append_pywren_stats(first_futures,
                        memory_mb=memory_capacity_mb,
                        cloud_objects_n=len(first_futures) * len(first_level_bounds))

    def merge_spectra_chunk_segments(segm_cobjects, id, storage):
        print(f'Merging segment {id} spectra chunks')

        def _merge(segm_cobject):
            return read_cloud_object_with_retry(storage, segm_cobject,
                                                msgpack.load)

        with ThreadPoolExecutor(max_workers=128) as pool:
            segm = list(pool.map(_merge, segm_cobjects))

        segm = np.concatenate(segm)

        # Alternative in-place sorting (slower) :
        # segm.view(f'{ds_segm_dtype},{ds_segm_dtype},{ds_segm_dtype}').sort(order=['f1'], axis=0)
        segm = segm[segm[:, 1].argsort()]

        bounds_list = first_level_bounds[id]
        # Global index of the first segment of this group: the number of
        # segments contained in all preceding first-level groups.
        base_id = sum(len(bounds) for bounds in first_level_bounds[:id])

        segms_len = []
        segms_cobjects = []
        for segm_j, (lower, upper) in enumerate(bounds_list):
            segm_start, segm_end = np.searchsorted(
                segm[:, 1], (lower, upper))  # mz expected to be in column 1
            sub_segm = segm[segm_start:segm_end]
            segms_len.append(len(sub_segm))

            segm_i = base_id + segm_j
            print(f'Storing dataset segment {segm_i}')
            segms_cobjects.append(storage.put_cobject(msgpack.dumps(sub_segm)))

        return segms_len, segms_cobjects

    # Transpose the per-chunk results so that each entry lists, for one
    # first-level range, the pieces produced from every chunk.
    second_level_segms_cobjects = np.transpose(
        first_level_segms_cobjects).tolist()
    # Wrap each list so it is passed to the mapped function as one argument.
    second_level_segms_cobjects = [
        [segm_cobjects] for segm_cobjects in second_level_segms_cobjects
    ]

    # same memory capacity
    second_futures = pw.map(merge_spectra_chunk_segments,
                            second_level_segms_cobjects,
                            runtime_memory=memory_capacity_mb)
    ds_segms_len, ds_segms_cobjects = list(zip(*pw.get_result(second_futures)))
    ds_segms_len = list(np.concatenate(ds_segms_len))
    ds_segms_cobjects = list(np.concatenate(ds_segms_cobjects))
    append_pywren_stats(second_futures,
                        memory_mb=memory_capacity_mb,
                        cloud_objects_n=ds_segm_n)

    return ds_segms_cobjects, ds_segms_len