def save(self, db_data_cobjs: List[CObj[DbFDRData]], peaks_cobjs: List[CObj[pd.DataFrame]]):
    """Copy db_data/peaks cloud objects into this cache's bucket/prefix and record their listing.

    Returns the new ``(db_data_cobjs, peaks_cobjs)`` lists pointing at the cached copies.
    """
    dest_bucket = self.bucket

    def batch_copy(src_cobjs: List[CloudObject], dest_prefix: str, *, storage: Storage):
        # If Lithops' storage supported Copy Object operations, this download/re-upload
        # round-trip could be easily optimized. Not sure if it's worth the effort yet.
        return [
            storage.put_cloudobject(blob, dest_bucket, f'{dest_prefix}/{idx:06}')
            for idx, blob in enumerate(iter_cobjects_with_prefetch(storage, src_cobjs))
        ]

    # Copy cobjs to the cache dir
    new_db_data_cobjs, new_peaks_cobjs = self.executor.map(
        batch_copy,
        [(db_data_cobjs, f'{self.prefix}/db_data'), (peaks_cobjs, f'{self.prefix}/peaks')],
        runtime_memory=1024,
    )

    # Save config in case it's needed for debugging
    self.storage.put_cloudobject(json.dumps(self.ds_config, indent=4), self.bucket, self.config_key)

    # Save list of cobjects. This list would be easy to reconstruct by listing keys, but
    # saving a separate object as the last step of the process is helpful to confirm that
    # the cache item is complete, and didn't partially fail to copy.
    save_cobj(self.storage, (new_db_data_cobjs, new_peaks_cobjs), self.bucket, self.meta_key)

    return new_db_data_cobjs, new_peaks_cobjs
def test_get_moldb_centroids(LockMock, executor: Executor, sm_config, ds_config):
    """End-to-end check that centroid generation de-duplicates formula_i values across
    moldbs and propagates the `targeted` flag per-formula."""
    formulas0 = ['H2O', 'CO2']
    formulas1 = ['H2SO4', 'CO2']
    formulas2 = ['H2SO4', 'NH4']
    moldbs: List[InputMolDb] = [
        {'id': db_id, 'targeted': targeted, 'cobj': save_cobj(executor.storage, formulas)}
        for db_id, targeted, formulas in [
            (0, False, formulas0),
            (1, True, formulas1),
            (2, False, formulas2),
        ]
    ]

    db_data_cobjs, peaks_cobjs = get_moldb_centroids(
        executor,
        sm_config['lithops']['sm_storage'],
        ds_config,
        moldbs,
        debug_validate=True,
        use_cache=False,
    )

    db_data = load_cobjs(executor.storage, db_data_cobjs)
    peaks_df = pd.concat(load_cobjs(executor.storage, peaks_cobjs))

    # One (formula, modifier) -> formula_i mapping per moldb
    map_dfs = [d['formula_map_df'].set_index(['formula', 'modifier']) for d in db_data]
    h2o_formula_i0 = map_dfs[0].loc[('H2O', '')].formula_i
    co2_formula_i0 = map_dfs[0].loc[('CO2', '')].formula_i
    co2_formula_i1 = map_dfs[1].loc[('CO2', '')].formula_i
    h2so4_formula_i1 = map_dfs[1].loc[('H2SO4', '')].formula_i
    h2so4_formula_i2 = map_dfs[2].loc[('H2SO4', '')].formula_i

    assert co2_formula_i0 == co2_formula_i1, 'formula_i values should be de-duplicated'
    assert h2so4_formula_i1 == h2so4_formula_i2, 'formula_i values should be de-duplicated'
    assert co2_formula_i0 != h2so4_formula_i2, 'formula_i values should not conflict'
    assert not peaks_df.loc[
        h2o_formula_i0
    ].targeted.any(), "H2O shouldn't be targeted as it's not in a targeted DB"
    assert peaks_df.loc[
        co2_formula_i0
    ].targeted.any(), "CO2 should be targeted as it's in a targeted DB"
    assert peaks_df.loc[
        h2so4_formula_i1
    ].targeted.any(), "H2SO4 should be targeted as it's in a targeted DB"
def upload_segm(start_end):
    """Build the DataFrame for one (start, end) slice of the spectra arrays and store it.

    The RangeIndex preserves the global row positions of this slice.
    Relies on the enclosing scope for `mzs`, `ints`, `sp_idxs` and `storage`.
    """
    lo, hi = start_end
    segm_df = pd.DataFrame(
        {
            'mz': mzs[lo:hi],
            'int': ints[lo:hi],
            'sp_i': sp_idxs[lo:hi],
        },
        index=pd.RangeIndex(lo, hi),
    )
    return save_cobj(storage, segm_df)
def _upload_moldbs_from_db(moldb_ids, storage, sm_storage):
    """Ensure each moldb's distinct-formula list exists in cloud storage, uploading from
    the SQL database when missing.

    Returns a list of ``{'id', 'cobj', 'targeted'}`` dicts, ordered by moldb id.
    """
    bucket, prefix = sm_storage['moldb']
    moldb_defs = []
    # Sort the moldbs because the centroids cache key is affected by their order
    for moldb_id in sorted(moldb_ids):
        key = f'{prefix}/{moldb_id}'
        try:
            # EAFP: a HEAD request tells us whether the formulas were uploaded previously
            storage.head_object(bucket, key)
            logger.debug(f'Found mol db at {key}')
            cobject = CloudObject(storage.backend, bucket, key)
        except StorageNoSuchKeyError:
            logger.info(f'Uploading {key}...')
            mols_query = DB().select(
                'SELECT DISTINCT formula FROM molecule WHERE moldb_id = %s', (moldb_id, )
            )
            cobject = save_cobj(storage, [mol for mol, in mols_query], bucket=bucket, key=key)
            logger.info(f'Uploading {key}...Done')
        (targeted, ) = DB().select_one(
            'SELECT targeted FROM molecular_db WHERE id = %s', (moldb_id, )
        )
        moldb_defs.append({'id': moldb_id, 'cobj': cobject, 'targeted': targeted})
    return moldb_defs
def calculate_peaks_chunk(segm_i: int, segm_cobject: CObj[pd.DataFrame], *, storage: Storage):
    """Compute centroid peaks for one chunk of formulas and store them as a cloud object.

    Returns ``(peaks_cobject, n_rows)`` where n_rows is the number of peak rows produced.
    """
    print(f'Calculating peaks from formulas chunk {segm_i}')
    formulas_df = load_cobj(storage, segm_cobject)
    rows = formulas_df[['ion_formula', 'target', 'targeted']].itertuples(True, None)
    flat_peaks = list(chain.from_iterable(map(calculate_peaks_for_formula, rows)))

    # Column order doubles as the dtype spec - keep compact unsigned/float types
    dtypes = {
        'formula_i': 'u4',
        'peak_i': 'u1',
        'mz': 'f8',
        'int': 'f4',
        'target': '?',
        'targeted': '?',
    }
    peaks_df = (
        pd.DataFrame(flat_peaks, columns=list(dtypes))
        .astype(dtypes)
        .set_index('formula_i')
    )

    print(f'Storing centroids chunk {segm_i}')
    return save_cobj(storage, peaks_df), len(peaks_df)
def run_coloc_job(moldb_id, image_ids, ion_ids, fdrs, *, storage):
    """Run colocalization analysis for one moldb and store each result as a cloud object.

    Relies on the enclosing scope for `ds_id` and `sm_config`.
    """
    # NOTE(review): original comment said web_app_url is used to reach the
    # publicly-exposed storage server (Functions can't use the private address) -
    # presumably that happens inside ImageStorage(sm_config); confirm if relevant.
    images, h, w = _get_images(ImageStorage(sm_config), ds_id, image_ids)
    return [
        save_cobj(storage, job)
        for job in analyze_colocalization(ds_id, moldb_id, images, ion_ids, fdrs, h, w)
    ]
def clip_centr_df_chunk(peaks_i, peaks_cobject, storage):
    """Filter one centroids chunk to valid peaks and formulas within the dataset m/z range.

    Returns ``(clipped_cobject, n_rows)``. Relies on `mz_min`/`mz_max` from the
    enclosing scope.
    """
    print(f'Clipping centroids dataframe chunk {peaks_i}')
    chunk = load_cobj(storage, peaks_cobject).sort_values('mz')
    chunk = chunk[chunk.mz > 0]

    # Keep every row of a formula as long as at least one of its peaks lies inside
    # the dataset's m/z window
    in_range = (mz_min < chunk.mz) & (chunk.mz < mz_max)
    keep_formulas = chunk[in_range].index.unique()
    clipped = chunk[chunk.index.isin(keep_formulas)].reset_index()

    return save_cobj(storage, clipped), clipped.shape[0]
def save_png_chunk(df: pd.DataFrame, *, storage: Storage):
    """Render PNGs for a chunk of formula images and store them as one cloud object.

    `df.cobj` maps each formula_i (index) to the cloud object holding its image dict.
    Returns a cloud object containing a list of ``(formula_i, [png_bytes_or_None, ...])``
    pairs.
    """
    pngs = []
    # Group formula_is by the cobject that holds their images so each cobject is
    # fetched only once. defaultdict(list) is the idiomatic (and marginally cheaper)
    # spelling of the original defaultdict(lambda: []).
    groups = defaultdict(list)
    for formula_i, cobj in df.cobj.items():
        groups[cobj].append(formula_i)

    image_dict_iter = iter_cobjs_with_prefetch(storage, list(groups.keys()))
    for image_dict, formula_is in zip(image_dict_iter, groups.values()):
        for formula_i in formula_is:
            # Images may be None (missing); sparse images are densified before encoding
            formula_pngs = [
                png_generator.generate_png(img.toarray()) if img is not None else None
                for img in image_dict[formula_i]
            ]
            pngs.append((formula_i, formula_pngs))
    return save_cobj(storage, pngs)
def _upload_moldbs_from_files(file_paths, storage, sm_storage):
    """Ensure each local moldb file's formula list exists in cloud storage, uploading when
    missing.

    Returns a list of ``{'id', 'cobj', 'targeted'}`` dicts (id is the file stem,
    targeted is always False for file-based moldbs).
    """
    moldb_defs = []
    for file_path in file_paths:
        bucket, raw_key = _choose_cos_location(file_path, sm_storage, 'moldb')
        key = raw_key + '_formulas'
        try:
            # EAFP: a HEAD request tells us whether the formulas were uploaded previously
            storage.head_object(bucket, key)
            logger.debug(f'Found mol db at {key}')
            cobject = CloudObject(storage.backend, bucket, key)
        except StorageNoSuchKeyError:
            logger.info(f'Uploading {key}...')
            cobject = save_cobj(
                storage, read_moldb_file(file_path).formula, bucket=bucket, key=key
            )
            logger.info(f'Uploading {key}...Done')
        moldb_defs.append({
            'id': Path(file_path).stem,
            'cobj': cobject,
            'targeted': False,
        })
    return moldb_defs
def _second_level_upload(formula_is):
    """Slice the requested formulas out of the enclosing `segm` DataFrame, order them by
    m/z, and persist the result as a cloud object."""
    chunk = segm.loc[formula_is].sort_values('mz')
    return save_cobj(storage, chunk.reset_index())
def _first_level_upload(args):
    """Persist one (segm_i, df) pair, returning (segm_i, cobject).

    The segment id is carried separately in the return value, so the redundant
    'segm_i' column is removed (in place) before saving.
    """
    segm_i, segm_df = args
    del segm_df['segm_i']
    return segm_i, save_cobj(storage, segm_df)