def test_add_optical_image(self, fill_db, sm_config, ds_config):
    db = DB(sm_config['db'])
    action_queue_mock = MagicMock(spec=QueuePublisher)
    es_mock = MagicMock(spec=ESExporter)
    img_store_mock = MagicMock(ImageStoreServiceWrapper)
    img_store_mock.post_image.side_effect = [
        'opt_img_id1', 'opt_img_id2', 'opt_img_id3', 'thumbnail_id'
    ]
    img_store_mock.get_image_by_id.return_value = Image.new('RGB', (100, 100))

    ds_man = create_api_ds_man(sm_config=sm_config, db=db, es=es_mock,
                               img_store=img_store_mock, annot_queue=action_queue_mock)
    ds_man._annotation_image_shape = MagicMock(return_value=(100, 100))

    ds_id = '2000-01-01'
    ds = create_ds(ds_id=ds_id, ds_config=ds_config)
    zoom_levels = [1, 2, 3]
    raw_img_id = 'raw_opt_img_id'
    ds_man.add_optical_image(ds, raw_img_id, [[1, 0, 0], [0, 1, 0], [0, 0, 1]],
                             zoom_levels=zoom_levels)

    assert db.select('SELECT * FROM optical_image') == [
        ('opt_img_id{}'.format(i + 1), ds.id, zoom)
        for i, zoom in enumerate(zoom_levels)
    ]
    assert db.select('SELECT optical_image FROM dataset where id = %s',
                     params=(ds_id,)) == [(raw_img_id,)]
    assert db.select('SELECT thumbnail FROM dataset where id = %s',
                     params=(ds_id,)) == [('thumbnail_id',)]
def run_off_sample(sm_config, ds_ids_str, sql_where, fix_missing, overwrite_existing):
    db = DB()

    ds_ids = None
    if ds_ids_str:
        ds_ids = ds_ids_str.split(',')
    elif sql_where:
        ds_ids = [
            id for (id,) in db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]
    elif fix_missing:
        logger.info('Checking for missing off-sample jobs...')
        results = db.select(MISSING_OFF_SAMPLE_SEL)
        ds_ids = [ds_id for ds_id, in results]
        logger.info(f'Found {len(ds_ids)} missing off-sample sets')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    es_exp = ESExporter(db, sm_config)
    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(f'Running off-sample on {i+1} out of {len(ds_ids)}')
            ds = Dataset.load(db, ds_id)
            classify_dataset_ion_images(db, ds, sm_config['services'], overwrite_existing)
            es_exp.reindex_ds(ds_id)
        except Exception:
            logger.error(f'Failed to run off-sample on {ds_id}', exc_info=True)
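# A hypothetical CLI wrapper for run_off_sample, mirroring the argparse pattern used by the
# export scripts further down. The script entry point and flag names are assumptions for
# illustration, not part of the original source.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Run off-sample classification')
    parser.add_argument('--ds-ids', dest='ds_ids_str', default=None,
                        help='Comma-separated dataset ids')
    parser.add_argument('--sql-where', default=None,
                        help='SQL WHERE clause for selecting datasets')
    parser.add_argument('--fix-missing', action='store_true',
                        help='Only process datasets with missing off-sample jobs')
    parser.add_argument('--overwrite-existing', action='store_true')
    args = parser.parse_args()

    run_off_sample(SMConfig.get_conf(), args.ds_ids_str, args.sql_where,
                   args.fix_missing, args.overwrite_existing)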
def test_search_job_imzml_example(get_compute_img_measures_mock, filter_sf_metrics_mock,
                                  create_fill_sm_database, sm_config):
    get_compute_img_measures_mock.return_value = lambda *args: (0.9, 0.9, 0.9)
    filter_sf_metrics_mock.side_effect = lambda x: x

    SMConfig._config_dict = sm_config

    db = DB(sm_config['db'])
    try:
        job = SearchJob(None, 'imzml_example_ds')
        job.run(input_dir_path, ds_config_path, clean=True)

        # dataset meta asserts
        rows = db.select("SELECT name, file_path, img_bounds from dataset")
        img_bounds = {u'y': {u'max': 3, u'min': 1}, u'x': {u'max': 3, u'min': 1}}
        file_path = join(dirname(__file__), 'data', 'imzml_example_ds')
        assert len(rows) == 1
        assert rows[0] == (test_ds_name, file_path, img_bounds)

        # theoretical patterns asserts
        rows = db.select('SELECT db_id, sf_id, adduct, centr_mzs, centr_ints, prof_mzs, prof_ints '
                         'FROM theor_peaks '
                         'ORDER BY adduct')
        assert len(rows) == 3 + len(DECOY_ADDUCTS)
        for r in rows:
            assert r[3] and r[4]

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf_id, adduct, peaks_n, stats FROM iso_image_metrics '
                          'ORDER BY sf_id, adduct'))
        assert rows
        assert rows[0]
        assert tuple(rows[0][:2]) == (0, 10007)
        assert set(rows[0][4].keys()) == {'chaos', 'spatial', 'spectral'}

        # image asserts
        rows = db.select(('SELECT db_id, sf_id, adduct, peak, intensities, min_int, max_int '
                          'FROM iso_image '
                          'ORDER BY sf_id, adduct'))
        assert rows

        max_int = 0.0
        for r in rows:
            max_int = max(max_int, r[-1])
            assert tuple(r[:2]) == (0, 10007)
        assert max_int
    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
def test_search_job_imzml_example(get_compute_img_measures_mock, create_fill_sm_database,
                                  sm_config):
    get_compute_img_measures_mock.return_value = lambda *args: (0.9, 0.9, 0.9)

    SMConfig._config_dict = sm_config

    db = DB(sm_config['db'])
    try:
        job = SearchJob(None, 'imzml_example_ds')
        job.run(input_dir_path, ds_config_path, clean=True)

        # dataset meta asserts
        rows = db.select("SELECT name, file_path, img_bounds from dataset")
        img_bounds = {u'y': {u'max': 3, u'min': 1}, u'x': {u'max': 3, u'min': 1}}
        file_path = 'file://' + join(data_dir_path, 'ds.txt')
        assert len(rows) == 1
        assert rows[0] == (test_ds_name, file_path, img_bounds)

        # theoretical patterns asserts
        rows = db.select('SELECT db_id, sf_id, adduct, centr_mzs, centr_ints, prof_mzs, prof_ints '
                         'FROM theor_peaks '
                         'ORDER BY adduct')
        assert len(rows) == 3 + len(DECOY_ADDUCTS)
        for r in rows:
            assert r[3] and r[4]

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf_id, adduct, peaks_n, stats FROM iso_image_metrics '
                          'ORDER BY sf_id, adduct'))
        assert rows
        assert rows[0]
        assert tuple(rows[0][:2]) == (0, 10007)
        assert set(rows[0][4].keys()) == {'chaos', 'spatial', 'spectral'}

        # image asserts
        rows = db.select(('SELECT db_id, sf_id, adduct, peak, intensities, min_int, max_int '
                          'FROM iso_image '
                          'ORDER BY sf_id, adduct'))
        assert rows

        max_int = 0.0
        for r in rows:
            max_int = max(max_int, r[-1])
            assert tuple(r[:2]) == (0, 10007)
        assert max_int
    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
def run(sm_config, ds_id_str, sql_where, algorithm, use_lithops):
    db = DB()

    if sql_where:
        ds_ids = [
            id for (id,) in db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]
    else:
        ds_ids = ds_id_str.split(',')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    if use_lithops:
        executor = Executor(sm_config['lithops'])

    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(f'[{i+1} / {len(ds_ids)}] Generating ion thumbnail for {ds_id}')
            ds = Dataset.load(db, ds_id)
            if use_lithops:
                # noinspection PyUnboundLocalVariable
                generate_ion_thumbnail_lithops(executor, db, ds, algorithm=algorithm)
            else:
                generate_ion_thumbnail(db, ds, algorithm=algorithm)
        except Exception:
            logger.error(f'Failed on {ds_id}', exc_info=True)
def reindex_results(sm_config, ds_id, ds_mask, use_inactive_index, offline_reindex,
                    update_fields):
    assert ds_id or ds_mask or offline_reindex

    IsocalcWrapper.set_centroids_cache_enabled(True)

    if offline_reindex:
        _reindex_all(sm_config)
    else:
        es_config = sm_config['elasticsearch']
        if use_inactive_index:
            es_config = get_inactive_index_es_config(es_config)

        db = DB()
        es_exp = ESExporter(db, sm_config={**sm_config, 'elasticsearch': es_config})

        if ds_id:
            ds_ids = ds_id.split(',')
        elif ds_mask:
            ds_ids = [
                id for (id,) in db.select(
                    "select id from dataset where name like '{}%'".format(ds_mask))
            ]
        else:
            ds_ids = []

        if update_fields:
            _partial_update_datasets(ds_ids, es_exp, update_fields.split(','))
        else:
            _reindex_datasets(ds_ids, es_exp)
class SciTester(object):

    def __init__(self, db_config):
        self.db = DB(db_config)
        self.base_search_res_path = join(proj_root(), 'tests/reports',
                                         'spheroid_12h_search_res.csv')
        self.metrics = ['chaos', 'spatial', 'spectral']

    def metr_dict_to_array(self, metr_d):
        return np.array([metr_d[m] for m in self.metrics])

    def read_base_search_res(self):
        with open(self.base_search_res_path) as f:
            rows = map(lambda line: line.strip('\n').split('\t'), f.readlines()[1:])
            return {(r[0], r[1]): np.array(r[2:], dtype=float) for r in rows}

    def fetch_search_res(self):
        rows = self.db.select(SEARCH_RES_SELECT, ds_name, 'HMDB')
        return {(r[0], r[1]): self.metr_dict_to_array(r[2]) for r in rows}

    def run_sci_test(self):
        compare_search_results(self.read_base_search_res(), self.fetch_search_res())

    def save_sci_test_report(self):
        with open(self.base_search_res_path, 'w') as f:
            f.write('\t'.join(['sf', 'adduct'] + self.metrics) + '\n')
            for (sf, adduct), metrics in sorted(self.fetch_search_res().iteritems()):
                f.write('\t'.join([sf, adduct] + metrics.astype(str).tolist()) + '\n')

        print 'Successfully saved sample dataset search report'
def test_save_sf_iso_images_correct_db_call(spark_context, create_fill_sm_database,
                                            sm_config, ds_config):
    sf_iso_imgs = spark_context.parallelize([
        ((1, '+H'), [
            csr_matrix([[100, 0, 0], [0, 0, 0]]),
            csr_matrix([[0, 0, 0], [0, 0, 10]]),
        ])
    ])
    sf_adduct_peaksn = [(1, '+H', 2)]
    res = SearchResults(0, 0, 0, 'ds_name', sf_adduct_peaksn, db_mock, sm_config, ds_config)
    res.sf_iso_images = sf_iso_imgs
    res.nrows, res.ncols = 2, 3
    res.store_sf_iso_images()

    correct_rows = [
        (0, 0, 1, '+H', 0, [0], [100], 0, 100),
        (0, 0, 1, '+H', 1, [5], [10], 0, 10),
    ]

    db = DB(sm_config['db'])
    try:
        rows = db.select(
            'SELECT job_id, db_id, sf_id, adduct, peak, pixel_inds, intensities, min_int, max_int '
            'FROM iso_image '
            'ORDER BY sf_id, adduct')
        assert correct_rows == rows
    finally:
        db.close()
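# Why the expected pixel_inds above are [0] and [5]: the 2x3 images are flattened
# row-major, so the first matrix's nonzero at (0, 0) maps to index 0, and the second
# matrix's nonzero at (row=1, col=2) maps to 1 * 3 + 2 == 5. A quick standalone check
# of that arithmetic (scipy assumed available):
from scipy.sparse import csr_matrix

img = csr_matrix([[0, 0, 0], [0, 0, 10]])
row, col = img.nonzero()
assert list(row * img.shape[1] + col) == [5]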
def migrate_optical_images(ds_id):
    output.print('Migrating optical images')

    with timeit():
        output.print('Transferring images and updating database...')

        db = DB()
        rows = db.select(SEL_OPTICAL_IMGS, params=(ds_id,))
        for opt_image_id, opt_image_url in rows:
            if not opt_image_url and opt_image_id:
                transfer_images(
                    ds_id,
                    'optical_images',
                    image_storage.OPTICAL,
                    [opt_image_id],
                )
                opt_image_url = image_storage.get_image_url(
                    image_storage.OPTICAL, ds_id, opt_image_id
                )
                db.alter(UPD_OPTICAL_IMGS, params=(opt_image_url, opt_image_id))

        opt_thumb_id, opt_thumb_url = db.select_one(SEL_OPT_THUMB, params=(ds_id,))
        if not opt_thumb_url and opt_thumb_id:
            transfer_images(
                ds_id,
                'optical_images',
                image_storage.OPTICAL,
                [opt_thumb_id],
            )
            opt_thumb_url = image_storage.get_image_url(image_storage.OPTICAL, ds_id,
                                                        opt_thumb_id)
            db.alter(UPD_OPT_THUMB, params=(opt_thumb_url, ds_id))
def update_optical_images(ds_id_str, sql_where):
    db = DB()

    if ds_id_str:
        ds_ids = ds_id_str.split(',')
    else:
        ds_ids = [
            id for (id,) in db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]

    for i, ds_id in enumerate(ds_ids):
        try:
            transform, img_id = db.select_one(
                'SELECT transform, optical_image from dataset WHERE id = %s',
                params=(ds_id,))
            if img_id and transform:
                logger.info(f'[{i + 1}/{len(ds_ids)}] Updating optical image of dataset {ds_id}')
                add_optical_image(db, ds_id, img_id, transform)
            else:
                logger.info(f'[{i + 1}/{len(ds_ids)}] Skipping dataset {ds_id}')
        except Exception:
            logger.error(f'Failed to update optical image on {ds_id}', exc_info=True)
def test_new_ds_saves_to_db(test_db, metadata, ds_config):
    db = DB()
    moldb = create_test_molecular_db()
    ds_config['database_ids'] = [moldb.id]
    ds = create_test_ds(config={**ds_config, 'database_ids': [moldb.id]})

    ion_metrics_df = pd.DataFrame({
        'formula': ['H2O', 'H2O', 'CO2', 'CO2', 'H2SO4', 'H2SO4'],
        'adduct': ['+H', '[M]+', '+H', '[M]+', '+H', '[M]+'],
        'fdr': [0.05, 0.1, 0.05, 0.1, 0.05, 0.1],
        'image_id': list(map(str, range(6))),
    })
    (job_id,) = db.insert_return(
        "INSERT INTO job (moldb_id, ds_id, status) VALUES (%s, %s, 'FINISHED') RETURNING id",
        rows=[(moldb.id, ds.id)],
    )
    db.insert(
        'INSERT INTO annotation('
        '   job_id, formula, chem_mod, neutral_loss, adduct, msm, fdr, stats, iso_image_ids'
        ') '
        "VALUES (%s, %s, '', '', %s, 1, %s, '{}', %s)",
        [(job_id, r.formula, r.adduct, r.fdr, [r.image_id])
         for i, r in ion_metrics_df.iterrows()],
    )

    with patch(
        'sm.engine.postprocessing.colocalization.ImageStorage.get_ion_images_for_analysis'
    ) as get_ion_images_for_analysis_mock:
        get_ion_images_for_analysis_mock.side_effect = mock_get_ion_images_for_analysis

        Colocalization(db).run_coloc_job(ds)

    jobs = db.select('SELECT id, error, sample_ion_ids FROM graphql.coloc_job')
    annotations = db.select('SELECT coloc_ion_ids, coloc_coeffs FROM graphql.coloc_annotation')
    ions = db.select('SELECT id FROM graphql.ion')

    assert len(jobs) > 0
    assert not any(job[1] for job in jobs)
    assert jobs[0][2]
    assert len(annotations) > 10
    assert all(len(ann[0]) == len(ann[1]) for ann in annotations)
    assert len(ions) == len(ion_metrics_df)
def reindex_all_results(conf):
    db = DB(conf['db'])
    es_exp = ESExporter(conf)

    es_exp.delete_index(name='sm')
    es_exp.create_index(name='sm')

    ds_db_pairs = db.select("select name, config -> 'database'::text -> 'name'::text from dataset")
    for ds_name, db_name in ds_db_pairs:
        es_exp.index_ds(db, ds_name, db_name)
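# The JSONB path in the query above assumes dataset.config documents shaped roughly like
# the sketch below. Only the 'database'/'name' keys are read here; the other fields are
# illustrative, inferred from the export scripts further down that access
# config['isotope_generation'].
example_ds_config = {
    'database': {'name': 'HMDB'},
    'isotope_generation': {
        'adducts': ['+H', '+Na', '+K'],
        'charge': {'polarity': '+', 'n_charges': 1},
    },
}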
def test_theor_peaks_generator_run_failed_iso_peaks(create_fill_test_db, spark_context,
                                                    sm_config, ds_config):
    ds_config["isotope_generation"]["adducts"] = ["+Na"]
    theor_peaks_gen = TheorPeaksGenerator(spark_context, sm_config, ds_config)
    theor_peaks_gen.isocalc_wrapper.isotope_peaks = lambda *args: Centroids([], [])
    theor_peaks_gen.run()

    db = DB(sm_config['db'])
    rows = db.select('SELECT * FROM theor_peaks')

    assert len(rows) == 1

    db.close()
def reindex_results(ds_id, ds_mask):
    assert ds_id or ds_mask

    conf = SMConfig.get_conf()
    if ds_mask == '_all_':
        _reindex_all(conf)
    else:
        db = DB(conf['db'])
        es_exp = ESExporter(db)

        if ds_id:
            rows = db.select(
                "select id, name, config from dataset where id = '{}'".format(ds_id))
        elif ds_mask:
            rows = db.select(
                "select id, name, config from dataset where name like '{}%'".format(ds_mask))
        else:
            rows = []

        _reindex_datasets(rows, es_exp)
def test_add_optical_image(image_storage_mock, requests_mock, fill_db, metadata, ds_config):
    image_ids = [
        'opt_img_scaled_id1',
        'opt_img_id1',
        'opt_img_scaled_id2',
        'opt_img_id2',
        'opt_img_scaled_id3',
        'opt_img_id3',
        'thumbnail_id',
    ]
    image_storage_mock.post_image.side_effect = image_ids
    image_storage_mock.get_image_url.return_value = [f'http://{img_id}' for img_id in image_ids]
    image_storage_mock.get_image.return_value = create_image_bytes()

    requests_mock.get.return_value = mock.Mock(content=create_image_bytes())

    db = DB()
    ds = create_test_ds()

    zoom_levels = [1, 2, 3]
    raw_img_id = 'raw_opt_img_id'
    add_optical_image(
        db, ds.id, raw_img_id, [[1, 0, 0], [0, 1, 0], [0, 0, 1]], zoom_levels=zoom_levels
    )

    optical_images = db.select('SELECT ds_id, type, zoom FROM optical_image')
    for type, zoom in itertools.product(
        [OpticalImageType.SCALED, OpticalImageType.CLIPPED_TO_ION_IMAGE], zoom_levels
    ):
        assert (ds.id, type, zoom) in optical_images

    assert db.select('SELECT optical_image FROM dataset where id = %s', params=(ds.id,)) == [
        (raw_img_id,)
    ]
    assert db.select('SELECT thumbnail FROM dataset where id = %s', params=(ds.id,)) == [
        ('thumbnail_id',)
    ]
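# Why post_image is fed seven ids above: add_optical_image is expected to store two derived
# images per zoom level (a SCALED and a CLIPPED_TO_ION_IMAGE variant) plus one thumbnail,
# which is exactly what the itertools.product assertion checks. A quick sanity check of
# that arithmetic:
n_zoom_levels = 3
n_types = 2  # OpticalImageType.SCALED and OpticalImageType.CLIPPED_TO_ION_IMAGE
assert n_types * n_zoom_levels + 1 == 7  # matches len(image_ids)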
def test_theor_peaks_generator_run_1(create_fill_test_db, spark_context, sm_config, ds_config):
    ds_config["isotope_generation"]["adducts"] = ["+H", "+Na"]

    theor_peaks_gen = TheorPeaksGenerator(spark_context, sm_config, ds_config)
    theor_peaks_gen.isocalc_wrapper.isotope_peaks = lambda *args: Centroids([100., 200.],
                                                                            [10., 1.])
    theor_peaks_gen.run()

    db = DB(sm_config['db'])
    rows = db.select(('SELECT db_id, sf_id, adduct, sigma, charge, pts_per_mz, centr_mzs, '
                      'centr_ints, prof_mzs, prof_ints FROM theor_peaks ORDER BY sf_id, adduct'))

    assert len(rows) == 2 + 80
    assert (filter(lambda r: r[2] == '+H', rows)[0] ==
            (0, 9, '+H', 0.01, 1, 10000, [100., 200.], [10., 1.], [], []))
    assert (filter(lambda r: r[2] == '+Na', rows)[0] ==
            (0, 9, '+Na', 0.01, 1, 10000, [100., 200.], [10., 1.], [], []))

    db.close()
def find_dataset_ids(ds_ids_param, sql_where, missing, failed, succeeded):
    db = DB()

    if ds_ids_param:
        specified_ds_ids = ds_ids_param.split(',')
    elif sql_where:
        specified_ds_ids = db.select_onecol(f"SELECT id FROM dataset WHERE {sql_where}")
    else:
        specified_ds_ids = None

    if not missing:
        # Default to processing all datasets missing diagnostics
        missing = specified_ds_ids is None and not failed and not succeeded

    ds_type_counts = db.select(
        'SELECT d.id, COUNT(DISTINCT dd.type), COUNT(dd.error) '
        'FROM dataset d LEFT JOIN dataset_diagnostic dd on d.id = dd.ds_id '
        "WHERE d.status = 'FINISHED' "
        'GROUP BY d.id')

    if missing or failed or succeeded:
        # Get ds_ids based on status (or filter specified ds_ids on status)
        status_ds_ids = set()
        for ds_id, n_diagnostics, n_errors in ds_type_counts:
            if missing and (n_diagnostics or 0) < len(DiagnosticType):
                status_ds_ids.add(ds_id)
            elif failed and n_errors > 0:
                status_ds_ids.add(ds_id)
            elif succeeded and n_diagnostics == len(DiagnosticType) and n_errors == 0:
                status_ds_ids.add(ds_id)

        if specified_ds_ids is not None:
            # Keep order, if directly specified
            ds_ids = [ds_id for ds_id in specified_ds_ids if ds_id in status_ds_ids]
        else:
            # Order by ID descending, so that newer DSs are updated first
            ds_ids = sorted(status_ds_ids, reverse=True)
    else:
        ds_ids = specified_ds_ids

    assert ds_ids, 'No datasets found'
    return ds_ids
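# Usage sketch (hypothetical dataset ids): with no ids, filter, or flags given, the
# function falls back to selecting every FINISHED dataset that is missing diagnostics.
all_missing = find_dataset_ids(
    ds_ids_param=None, sql_where=None, missing=False, failed=False, succeeded=False
)
# Or restrict explicitly listed datasets to those with failed diagnostics, preserving order:
failed_ids = find_dataset_ids(
    '2021-01-01_x,2021-01-02_y',  # hypothetical ids for illustration
    None, missing=False, failed=True, succeeded=False,
)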
def _reindex_all(conf):
    es_config = conf['elasticsearch']
    alias = es_config['index']
    es_man = ESIndexManager(es_config)
    new_index = es_man.another_index_name(es_man.internal_index_name(alias))
    es_man.create_index(new_index)

    try:
        tmp_es_config = deepcopy(es_config)
        tmp_es_config['index'] = new_index

        db = DB(conf['db'])
        es_exp = ESExporter(db, tmp_es_config)
        rows = db.select('select id, name, config from dataset')
        _reindex_datasets(rows, es_exp)

        es_man.remap_alias(tmp_es_config['index'], alias=alias)
    except Exception as e:
        es_man.delete_index(new_index)
        raise e
def run(ds_id, sql_where):
    conf = SMConfig.get_conf()

    db = DB(conf['db'])
    img_store = ImageStoreServiceWrapper(conf['services']['img_service_url'])

    if sql_where:
        ds_ids = [
            id for (id,) in db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]
    else:
        ds_ids = ds_id.split(',')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(f'[{i+1} / {len(ds_ids)}] Updating acq geometry for {ds_id}')
            ds = Dataset.load(db, ds_id)
            (sample_img_id,) = db.select_one(
                "SELECT iim.iso_image_ids[1] from job j "
                "JOIN iso_image_metrics iim on j.id = iim.job_id "
                "WHERE j.ds_id = %s LIMIT 1",
                [ds_id],
            )
            print(sample_img_id)
            if sample_img_id:
                # PIL's Image.size is (width, height), so swap to get (n_rows, n_cols)
                w, h = img_store.get_image_by_id('fs', 'iso_image', sample_img_id).size
                dims = (h, w)
            else:
                dims = (None, None)
            acq_geometry = make_acq_geometry('ims', None, ds.metadata, dims)
            ds.save_acq_geometry(db, acq_geometry)
        except Exception:
            logger.error(f'Failed on {ds_id}', exc_info=True)
def update_public_database_descriptions():
    db = DB()
    public_db_names = db.select(
        'SELECT name FROM molecular_db WHERE is_public = true AND archived = false')
    logger.info(f'Updating public molecular databases: {public_db_names}')

    for (name,) in public_db_names:
        desc = database_descriptions.get(name, None)
        if desc:
            db.alter(
                "UPDATE molecular_db "
                "SET description = %s, full_name = %s, link = %s, citation = %s "
                "WHERE name = %s;",
                params=(
                    desc['description'],
                    desc['full_name'],
                    desc['link'],
                    desc['citation'],
                    name,
                ),
            )
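# database_descriptions is referenced but not defined in this snippet. Judging from the
# fields read above, it is presumably a module-level mapping shaped like the sketch below;
# the entry and its values are illustrative placeholders, not real project data.
database_descriptions = {
    'SomeDB-v1': {
        'description': '...',
        'full_name': '...',
        'link': 'https://example.com/',
        'citation': '...',
    },
}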
def _reindex_all(sm_config):
    es_config = sm_config['elasticsearch']
    alias = es_config['index']
    es_man = ESIndexManager(es_config)
    old_index = es_man.internal_index_name(alias)
    new_index = es_man.another_index_name(old_index)
    es_man.create_index(new_index)

    try:
        inactive_es_config = get_inactive_index_es_config(es_config)
        db = DB()
        es_exp = ESExporter(db, {**sm_config, 'elasticsearch': inactive_es_config})
        ds_ids = [r[0] for r in db.select('select id from dataset')]
        _reindex_datasets(ds_ids, es_exp)

        es_man.remap_alias(inactive_es_config['index'], alias=alias)
    except Exception as e:
        es_man.delete_index(new_index)
        raise e
    else:
        es_man.delete_index(old_index)
def ensure_db_populated(sm_config, analysis_version, database):
    db = DB()

    # Install DB schema if needed
    query = "SELECT COUNT(*) FROM pg_tables WHERE schemaname = 'public' AND tablename = 'dataset'"
    tables_exist = db.select_one(query)[0] >= 1
    if not tables_exist:
        print('Installing DB schema')
        db.alter(DB_SQL_SCHEMA)

    # Import HMDB if needed
    moldb = MOL_DBS[database]
    try:
        molecular_db.find_by_name_version(moldb['name'], moldb['version'])
    except SMError:
        print(f'Importing {database}')
        with TemporaryDirectory() as tmp:
            urlretrieve(moldb['url'], f'{tmp}/moldb.tsv')
            molecular_db.create(moldb['name'], moldb['version'], f'{tmp}/moldb.tsv')

    if analysis_version > 1:
        if len(db.select("SELECT name FROM scoring_model WHERE name = 'v3_default'")) == 0:
            print('Importing v3_default scoring model')
            params = upload_catboost_scoring_model(
                model=Path(proj_root())
                / '../scoring-models/v3_default/model-2022-01-05T13-45-26.947188-416b1311.cbm',
                bucket=sm_config['lithops']['lithops']['storage_bucket'],
                prefix='test_scoring_models/v3_default',
                is_public=False,
            )
            save_scoring_model_to_db(name='v3_default', type_='catboost', params=params)
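# MOL_DBS is referenced but not defined in this snippet. Given the keys accessed above
# ('name', 'version', 'url'), it is presumably a mapping like the sketch below; the entry
# and URL are placeholders for illustration, not a real download location.
MOL_DBS = {
    'hmdb': {
        'name': 'HMDB',
        'version': 'v4',
        'url': 'https://example.com/hmdb-v4.tsv',
    },
}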
def test_search_job_imzml_example(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                  post_images_to_annot_service_mock,
                                  MolDBServiceWrapperMock, MolDBServiceWrapperMock2,
                                  sm_config, create_fill_sm_database, es_dsl_search,
                                  clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock2)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {'iso_image_ids': ['iso_image_1', None, None, None]}
    post_images_to_annot_service_mock.return_value = {35: url_dict, 44: url_dict}

    db = DB(sm_config['db'])

    try:
        ds_config_str = open(ds_config_path).read()
        upload_dt = datetime.now()
        ds_id = '2000-01-01_00h00m'
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url'])
        job = SearchJob(img_store=img_store)
        job._sm_config['rabbitmq'] = {}  # avoid talking to RabbitMQ during the test
        ds = Dataset.load(db, ds_id)
        job.run(ds)

        # dataset table asserts
        rows = db.select('SELECT id, name, input_path, upload_dt, status from dataset')
        input_path = join(dirname(__file__), 'data', test_ds_name)
        assert len(rows) == 1
        assert rows[0] == (ds_id, test_ds_name, input_path, upload_dt, DatasetStatus.FINISHED)

        # ms acquisition geometry asserts
        rows = db.select('SELECT acq_geometry from dataset')
        assert len(rows) == 1
        assert rows[0][0] == ds.get_acq_geometry(db)
        assert rows[0][0] == {
            ACQ_GEOMETRY_KEYS.LENGTH_UNIT: 'nm',
            ACQ_GEOMETRY_KEYS.AcqGridSection.section_name: {
                ACQ_GEOMETRY_KEYS.AcqGridSection.REGULAR_GRID: True,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_X: 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_Y: 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_X: 100,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_Y: 100
            },
            ACQ_GEOMETRY_KEYS.PixelSizeSection.section_name: {
                ACQ_GEOMETRY_KEYS.PixelSizeSection.REGULAR_SIZE: True,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_X: 100,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_Y: 100
            }
        }

        # job table asserts
        rows = db.select('SELECT db_id, ds_id, status, start, finish from job')
        assert len(rows) == 1
        db_id, ds_id, status, start, finish = rows[0]
        assert (db_id, ds_id, status) == (0, '2000-01-01_00h00m', 'FINISHED')
        assert start < finish

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf, adduct, stats, iso_image_ids '
                          'FROM iso_image_metrics '
                          'ORDER BY sf, adduct'))
        assert rows[0] == (0, 'C12H24O', '+K',
                           {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                            'total_iso_ints': [100.], 'min_iso_ints': [0],
                            'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])
        assert rows[1] == (0, 'C12H24O', '+Na',
                           {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                            'total_iso_ints': [100.], 'min_iso_ints': [0],
                            'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])

        time.sleep(1)  # Waiting for ES

        # ES asserts
        ds_docs = es_dsl_search.query('term', _type='dataset').execute().to_dict()['hits']['hits']
        assert 1 == len(ds_docs)
        ann_docs = es_dsl_search.query('term', _type='annotation').execute().to_dict()['hits']['hits']
        assert len(ann_docs) == len(rows)
        for doc in ann_docs:
            assert doc['_id'].startswith(ds_id)
    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
# 'WHERE d.name = %s '
# 'ORDER BY t.target_add, t.sf_id')

EXPORT_SEL = ('SELECT adds.sf_id, adds.target_add, f.sf, adds.decoy_add '
              'FROM target_decoy_add adds '
              'JOIN agg_formula f ON f.id = adds.sf_id '
              'JOIN job j ON j.id = adds.job_id '
              # the trailing space matters: without it the next clause fuses into "f.db_idWHERE"
              'JOIN dataset ds ON ds.id = j.ds_id AND adds.db_id = f.db_id '
              'WHERE ds.name = %s '
              'ORDER BY adds.target_add, adds.sf_id')

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Exporting target/decoy sets into a csv file')
    parser.add_argument('ds_name', type=str, help='Dataset name')
    parser.add_argument('csv_path', type=str, help='Path for the csv file')
    parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path')
    parser.set_defaults(sm_config_path=path.join(proj_root(), 'conf/config.json'))
    args = parser.parse_args()

    SMConfig.set_path(args.sm_config_path)

    db = DB(SMConfig.get_conf()['db'])
    export_rs = db.select(EXPORT_SEL, args.ds_name)

    header = ','.join(['sf_id', 'target_add', 'sf', 'decoy_add']) + '\n'
    with open(args.csv_path, 'w') as f:
        f.write(header)
        f.writelines([','.join(map(str, row)) + '\n' for row in export_rs])

    logger.info('Exported all search results for "%s" dataset into "%s" file',
                args.ds_name, args.csv_path)
class SciTester(object):

    def __init__(self, sm_config_path):
        self.sm_config_path = sm_config_path
        self.sm_config = SMConfig.get_conf()
        self.db = DB(self.sm_config['db'])

        self.ds_id = '2000-01-01-00_00_00'
        self.base_search_res_path = join(proj_root(), 'tests/reports',
                                         'spheroid_untreated_search_res.csv')
        self.ds_name = 'sci_test_spheroid_untreated'
        self.data_dir_path = join(self.sm_config['fs']['base_path'], self.ds_name)
        self.input_path = join(proj_root(), 'tests/data/untreated')
        self.ds_config_path = join(self.input_path, 'config.json')
        self.metrics = ['chaos', 'spatial', 'spectral']

    def metr_dict_to_array(self, metr_d):
        metric_array = np.array([metr_d[m] for m in self.metrics])
        return np.hstack([metric_array, metric_array.prod()])

    def read_base_search_res(self):
        def prep_metric_arrays(a):
            return np.array(a, dtype=float)

        with open(self.base_search_res_path) as f:
            rows = map(lambda line: line.strip('\n').split('\t'), f.readlines()[1:])
            return {(r[0], r[1]): prep_metric_arrays(r[2:]) for r in rows}

    def fetch_search_res(self):
        mol_db_service = MolDBServiceWrapper(self.sm_config['services']['mol_db'])
        mol_db_id = mol_db_service.find_db_by_name_version('HMDB-v2.5')[0]['id']
        rows = self.db.select(SEARCH_RES_SELECT, params=(mol_db_id, self.ds_name))
        return {(r[0], r[1]): self.metr_dict_to_array(r[2]) for r in rows}

    def save_sci_test_report(self):
        with open(self.base_search_res_path, 'w') as f:
            f.write('\t'.join(['sf', 'adduct'] + self.metrics) + '\n')
            for (sf, adduct), metrics in sorted(self.fetch_search_res().items()):
                f.write('\t'.join([sf, adduct] + metrics.astype(str).tolist()) + '\n')

        print('Successfully saved sample dataset search report')

    @staticmethod
    def print_metric_hist(metric_arr, bins=10):
        metric_freq, metric_interv = np.histogram(metric_arr, bins=np.linspace(-1, 1, 21))
        metric_interv = [round(x, 2) for x in metric_interv]
        pprint(list(zip(zip(metric_interv[:-1], metric_interv[1:]), metric_freq)))

    def report_metric_differences(self, metrics_array):
        metrics_array = np.array(metrics_array)
        print("\nCHAOS HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 0])
        print("\nIMG_CORR HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 1])
        print("\nPAT_MATCH HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 2])
        print("\nMSM HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 3])

    def _missed_formulas(self, old, new):
        missed_sf_adduct = set(old.keys()) - set(new.keys())
        print('MISSED FORMULAS: {:.1f}%'.format(len(missed_sf_adduct) / len(old) * 100))
        if missed_sf_adduct:
            missed_sf_base_metrics = np.array([old[k] for k in missed_sf_adduct])
            self.report_metric_differences(missed_sf_base_metrics)
        return bool(missed_sf_adduct)

    def _false_discovery(self, old, new):
        new_sf_adduct = set(new.keys()) - set(old.keys())
        print('\nFALSE DISCOVERY: {:.1f}%'.format(len(new_sf_adduct) / len(old) * 100))
        if new_sf_adduct:
            for sf_adduct in new_sf_adduct:
                metrics = new[sf_adduct]
                print('{} metrics = {}'.format(sf_adduct, metrics))
        return bool(new_sf_adduct)

    def _metrics_diff(self, old, new):
        print('\nDIFFERENCE IN METRICS:')
        metric_diffs = []
        for b_sf_add, b_metr in old.items():
            if b_sf_add in new.keys():
                metr = new[b_sf_add]
                diff = b_metr - metr
                if np.any(np.abs(diff) > 1e-6):
                    metric_diffs.append(diff)
                    print('{} metrics diff = {}'.format(b_sf_add, diff))
        if metric_diffs:
            self.report_metric_differences(metric_diffs)
        return bool(metric_diffs)

    def search_results_are_different(self):
        old_search_res = self.read_base_search_res()
        search_res = self.fetch_search_res()
        return (self._missed_formulas(old_search_res, search_res)
                or self._false_discovery(old_search_res, search_res)
                or self._metrics_diff(old_search_res, search_res))

    def _create_img_store_mock(self):

        class ImageStoreMock(object):
            def post_image(self, *args):
                return None

            def delete_image_by_id(self, *args):
                return None

        return ImageStoreMock()

    def run_search(self, mock_img_store=False):
        if mock_img_store:
            img_store = self._create_img_store_mock()
        else:
            img_store = ImageStoreServiceWrapper(self.sm_config['services']['img_service_url'])
        manager = SMDaemonManager(db=self.db, es=ESExporter(self.db), img_store=img_store)
        ds = create_ds_from_files(self.ds_id, self.ds_name, self.input_path)
        from sm.engine.search_job import SearchJob
        manager.annotate(ds, search_job_factory=SearchJob, del_first=True)

    def clear_data_dirs(self):
        with warn_only():
            local('rm -rf {}'.format(self.data_dir_path))
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Exporting search results into a csv file')
    parser.add_argument('ds_name', type=str, help='Dataset name')
    parser.add_argument('csv_path', type=str, help='Path for the csv file')
    parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path')
    parser.set_defaults(sm_config_path=path.join(proj_root(), 'conf/config.json'))
    args = parser.parse_args()

    SMConfig.set_path(args.sm_config_path)
    db = DB(SMConfig.get_conf()['db'])

    ds_config = db.select_one(DS_CONFIG_SEL, args.ds_name)[0]
    isotope_gen_config = ds_config['isotope_generation']
    charge = '{}{}'.format(isotope_gen_config['charge']['polarity'],
                           isotope_gen_config['charge']['n_charges'])
    export_rs = db.select(EXPORT_SEL, ds_config['database']['name'], args.ds_name,
                          isotope_gen_config['isocalc_sigma'], charge,
                          isotope_gen_config['isocalc_pts_per_mz'])

    header = ('\t'.join(['formula_db', 'db_ids', 'sf_name', 'sf', 'adduct']) + '\t'
              + '\t'.join(metrics) + '\t'
              + '\t'.join(['fdr', 'isocalc_sigma', 'isocalc_charge',
                           'isocalc_pts_per_mz', 'first_peak_mz']) + '\n')

    with open(args.csv_path, 'w') as f:
        f.write(header)
        f.writelines(['\t'.join(map(str, row)) + '\n' for row in export_rs])

    logger.info('Exported all search results for "%s" dataset into "%s" file',
                args.ds_name, args.csv_path)
class SciTester(object):

    def __init__(self, sm_config_path):
        self.sm_config_path = sm_config_path
        self.sm_config = SMConfig.get_conf()
        self.db = DB(self.sm_config['db'])

        self.ds_id = '2000-01-01-00_00_00'
        self.base_search_res_path = join(proj_root(), 'tests/reports',
                                         'spheroid_untreated_search_res.csv')
        self.ds_name = 'sci_test_spheroid_untreated'
        self.data_dir_path = join(self.sm_config['fs']['base_path'], self.ds_name)
        self.input_path = join(proj_root(), 'tests/data/untreated')
        self.ds_config_path = join(self.input_path, 'config.json')
        self.metrics = ['chaos', 'spatial', 'spectral']

    def metr_dict_to_array(self, metr_d):
        metric_array = np.array([metr_d[m] for m in self.metrics])
        return np.hstack([metric_array, metric_array.prod()])

    def read_base_search_res(self):
        def prep_metric_arrays(a):
            return np.array(a, dtype=float)

        with open(self.base_search_res_path) as f:
            rows = map(lambda line: line.strip('\n').split('\t'), f.readlines()[1:])
            return {(r[0], r[1]): prep_metric_arrays(r[2:]) for r in rows}

    def fetch_search_res(self):
        mol_db_service = MolDBServiceWrapper(self.sm_config['services']['mol_db'])
        mol_db_id = mol_db_service.find_db_by_name_version('HMDB-v2.5')[0]['id']
        rows = self.db.select(SEARCH_RES_SELECT, params=(mol_db_id, self.ds_name))
        return {(r[0], r[1]): self.metr_dict_to_array(r[2]) for r in rows}

    def save_sci_test_report(self):
        with open(self.base_search_res_path, 'w') as f:
            f.write('\t'.join(['sf', 'adduct'] + self.metrics) + '\n')
            for (sf, adduct), metrics in sorted(self.fetch_search_res().items()):
                f.write('\t'.join([sf, adduct] + metrics.astype(str).tolist()) + '\n')

        print('Successfully saved sample dataset search report')

    @staticmethod
    def print_metric_hist(metric_arr, bins=10):
        metric_freq, metric_interv = np.histogram(metric_arr, bins=np.linspace(-1, 1, 21))
        metric_interv = [round(x, 2) for x in metric_interv]
        pprint(list(zip(zip(metric_interv[:-1], metric_interv[1:]), metric_freq)))

    def report_metric_differences(self, metrics_array):
        metrics_array = np.array(metrics_array)
        print("\nCHAOS HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 0])
        print("\nIMG_CORR HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 1])
        print("\nPAT_MATCH HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 2])
        print("\nMSM HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 3])

    def _missed_formulas(self, old, new):
        missed_sf_adduct = set(old.keys()) - set(new.keys())
        print('MISSED FORMULAS: {:.1f}%'.format(len(missed_sf_adduct) / len(old) * 100))
        if missed_sf_adduct:
            missed_sf_base_metrics = np.array([old[k] for k in missed_sf_adduct])
            self.report_metric_differences(missed_sf_base_metrics)
        return bool(missed_sf_adduct)

    def _false_discovery(self, old, new):
        new_sf_adduct = set(new.keys()) - set(old.keys())
        print('\nFALSE DISCOVERY: {:.1f}%'.format(len(new_sf_adduct) / len(old) * 100))
        if new_sf_adduct:
            for sf_adduct in new_sf_adduct:
                metrics = new[sf_adduct]
                print('{} metrics = {}'.format(sf_adduct, metrics))
        return bool(new_sf_adduct)

    def _metrics_diff(self, old, new):
        print('\nDIFFERENCE IN METRICS:')
        metric_diffs = []
        for b_sf_add, b_metr in old.items():
            if b_sf_add in new.keys():
                metr = new[b_sf_add]
                diff = b_metr - metr
                if np.any(np.abs(diff) > 1e-6):
                    metric_diffs.append(diff)
                    print('{} metrics diff = {}'.format(b_sf_add, diff))
        if metric_diffs:
            self.report_metric_differences(metric_diffs)
        return bool(metric_diffs)

    def search_results_are_different(self):
        old_search_res = self.read_base_search_res()
        search_res = self.fetch_search_res()
        return (self._missed_formulas(old_search_res, search_res)
                or self._false_discovery(old_search_res, search_res)
                or self._metrics_diff(old_search_res, search_res))

    def _create_img_store_mock(self):

        class ImageStoreMock(object):
            def post_image(self, *args):
                return None

            def delete_image_by_id(self, *args):
                return None

        return ImageStoreMock()

    def run_search(self, mock_img_store=False):
        if mock_img_store:
            img_store = self._create_img_store_mock()
        else:
            img_store = ImageStoreServiceWrapper(self.sm_config['services']['img_service_url'])
        ds_man = SMDaemonDatasetManager(db=self.db, es=ESExporter(self.db),
                                        img_store=img_store, mode='local')
        ds = create_ds_from_files(self.ds_id, self.ds_name, self.input_path)
        from sm.engine import SearchJob
        ds_man.add(ds, search_job_factory=SearchJob, del_first=True)

    def clear_data_dirs(self):
        with warn_only():
            local('rm -rf {}'.format(self.data_dir_path))
"SELECT f.sf, t.adduct, t.centr_mzs, t.centr_ints " "FROM public.agg_formula f, public.theor_peaks t " "WHERE t.sf_id = f.id AND f.db_id = 1 AND f.sf = %s AND t.adduct = %s " # hardcoded to always fetch from HMDB, lazy i know "ORDER BY t.adduct;") if __name__ == "__main__": parser = argparse.ArgumentParser(description='Exporting isotopic images') parser.add_argument('sf', type=str, help='sum formula') parser.add_argument('add', type=str, help='adduct') parser.add_argument('pkl_path', type=str, help='Path for the cPickle file') parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path') parser.set_defaults( sm_config_path=path.join(proj_root(), 'conf/config.json')) args = parser.parse_args() SMConfig.set_path(args.sm_config_path) db = DB(SMConfig.get_conf()['db']) export_rs = db.select(EXPORT_SEL, args.sf, args.add) export_df = pd.DataFrame( export_rs, columns=['sf', 'adduct', 'centr_mzs', 'centr_ints']) export_df.to_csv(args.pkl_path, index=False) logger.info( 'Exported the spectra for the "%s" sum formula, "%s" adduct into "%s" file', args.sf, args.add, args.pkl_path)
class TheorPeaksGenerator(object):
    """ Generator of theoretical isotope peaks for all molecules in a database.

    Args
    ----------
    sc : pyspark.SparkContext
    sm_config : dict
        SM engine config
    ds_config : dict
        Dataset config
    """

    def __init__(self, sc, sm_config, ds_config):
        self.sc = sc
        self.sm_config = sm_config
        self.ds_config = ds_config

        self.theor_peaks_tmp_dir = join(sm_config['fs']['base_path'], 'tmp_theor_peaks_gen')
        self.db = DB(sm_config['db'])

        self.adducts = self.ds_config['isotope_generation']['adducts']

        self.isocalc_wrapper = IsocalcWrapper(self.ds_config['isotope_generation'])

    @staticmethod
    def _sf_elements(sf):
        return [seg.element().name() for seg in parseSumFormula(sf).get_segments()]

    @classmethod
    def _valid_sf_adduct(cls, sf, adduct):
        if sf is None or adduct is None or sf == 'None' or adduct == 'None':
            logger.warning('Invalid sum formula or adduct: sf=%s, adduct=%s', sf, adduct)
            return False

        if '-' in adduct and adduct.strip('-') not in cls._sf_elements(sf):
            logger.info('No negative adduct element in the sum formula: sf=%s, adduct=%s',
                        sf, adduct)
            return False

        return True

    def run(self):
        """ Starts peaks generation. Checks all formula peaks saved in the database
        and generates peaks only for new ones
        """
        logger.info('Running theoretical peaks generation')

        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
        formula_list = self.apply_database_filters(self.db.select(AGG_FORMULA_SEL, db_id))
        stored_sf_adduct = self.db.select(SF_ADDUCT_SEL, db_id,
                                          self.isocalc_wrapper.sigma,
                                          self.isocalc_wrapper.charge,
                                          self.isocalc_wrapper.pts_per_mz)

        sf_adduct_cand = self.find_sf_adduct_cand(formula_list, set(stored_sf_adduct))
        logger.info('%d saved (sf, adduct)s, %s not saved (sf, adduct)s',
                    len(stored_sf_adduct), len(sf_adduct_cand))

        if sf_adduct_cand:
            self.generate_theor_peaks(sf_adduct_cand)

    def apply_database_filters(self, formula_list):
        """ Filters according to settings in dataset config

        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through

        Returns
        -------
        : list
            Filtered list of pairs (id, sum formula)
        """
        if 'organic' in map(lambda s: s.lower(), self.ds_config['database'].get('filters', [])):
            logger.info('Organic sum formula filter has been applied')
            return filter(lambda (_, sf): 'C' in self._sf_elements(sf), formula_list)
        return formula_list

    def find_sf_adduct_cand(self, formula_list, stored_sf_adduct):
        """
        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through
        stored_sf_adduct : set
            Set of (formula, adduct) pairs which have theoretical patterns saved
            in the database

        Returns
        -------
        : list
            List of (formula id, formula, adduct) triples which don't have
            theoretical patterns saved in the database
        """
        assert formula_list, 'Empty agg_formula table!'
        adducts = set(self.adducts) | set(DECOY_ADDUCTS)
        cand = [(id, sf, a) for (id, sf) in formula_list for a in adducts]
        return filter(lambda (sf_id, sf, adduct): (sf, adduct) not in stored_sf_adduct, cand)

    def generate_theor_peaks(self, sf_adduct_cand):
        """ Generates theoretical peaks for the given candidates in batches
        and imports them into the database.

        Args
        ----
        sf_adduct_cand : list
            List of (formula id, formula, adduct) triples which don't have
            theoretical patterns saved in the database
        """
        logger.info('Generating missing peaks')

        formatted_iso_peaks = self.isocalc_wrapper.formatted_iso_peaks
        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]

        n = 10000
        for i in xrange(0, len(sf_adduct_cand), n):
            sf_adduct_cand_rdd = self.sc.parallelize(sf_adduct_cand[i:i + n], numSlices=128)
            peak_lines = (sf_adduct_cand_rdd
                          .flatMap(lambda (sf_id, sf, adduct):
                                   formatted_iso_peaks(db_id, sf_id, sf, adduct))
                          .collect())
            self._import_theor_peaks_to_db(peak_lines)

    def _import_theor_peaks_to_db(self, peak_lines):
        logger.info('Saving new peaks to the DB')

        if not exists(self.theor_peaks_tmp_dir):
            makedirs(self.theor_peaks_tmp_dir)

        peak_lines_path = join(self.theor_peaks_tmp_dir, 'peak_lines.csv')
        with open(peak_lines_path, 'w') as f:
            f.write('\n'.join(peak_lines))

        with open(peak_lines_path) as peaks_file:
            self.db.copy(peaks_file, 'theor_peaks')
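# Minimal usage sketch, mirroring the tests earlier in this section (a SparkContext,
# SM config, and dataset config are assumed to already be available):
theor_peaks_gen = TheorPeaksGenerator(spark_context, sm_config, ds_config)
# Computes and stores only the (sf, adduct) patterns not yet present in theor_peaks:
theor_peaks_gen.run()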
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----
    no_clean : bool
        Don't delete interim data files
    """

    def __init__(self, img_store=None, no_clean=False):
        self.no_clean = no_clean
        self._img_store = img_store

        self._job_id = None
        self._sc = None
        self._db = None
        self._ds = None
        self._ds_reader = None
        self._status_queue = None
        self._fdr = None
        self._wd_manager = None
        self._es = None

        self._sm_config = SMConfig.get_conf()

        logger.debug('Using SM config:\n%s', pformat(self._sm_config))

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self._sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self._sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key",
                      self._sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key",
                      self._sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            sconf.set("spark.hadoop.fs.s3a.endpoint",
                      "s3.{}.amazonaws.com".format(self._sm_config['aws']['aws_region']))

        self._sc = SparkContext(master=self._sm_config['spark']['master'],
                                conf=sconf, appName='SM engine')

    def _init_db(self):
        logger.info('Connecting to the DB')
        self._db = DB(self._sm_config['db'])

    def store_job_meta(self, mol_db_id):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        rows = [(mol_db_id, self._ds.id, 'STARTED',
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self._job_id = self._db.insert_return(JOB_INS, rows=rows)[0]

    def _run_annotation_job(self, mol_db):
        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info("Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                        self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            target_adducts = self._ds.config['isotope_generation']['adducts']
            self._fdr = FDR(job_id=self._job_id,
                            decoy_sample_size=20,
                            target_adducts=target_adducts,
                            db=self._db)

            isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
            centroids_gen = IonCentroidsGenerator(sc=self._sc, moldb_name=mol_db.name,
                                                  isocalc=isocalc)
            polarity = self._ds.config['isotope_generation']['charge']['polarity']
            all_adducts = list(set(self._sm_config['defaults']['adducts'][polarity])
                               | set(DECOY_ADDUCTS))
            centroids_gen.generate_if_not_exist(isocalc=isocalc,
                                                sfs=mol_db.sfs,
                                                adducts=all_adducts)
            target_ions = centroids_gen.ions(target_adducts)
            self._fdr.decoy_adducts_selection(target_ions)

            search_alg = MSMBasicSearch(sc=self._sc, ds=self._ds, ds_reader=self._ds_reader,
                                        mol_db=mol_db, centr_gen=centroids_gen,
                                        fdr=self._fdr, ds_config=self._ds.config)
            ion_metrics_df, ion_iso_images = search_alg.search()

            search_results = SearchResults(mol_db.id, self._job_id,
                                           search_alg.metrics.keys())
            mask = self._ds_reader.get_2d_sample_area_mask()
            img_store_type = self._ds.get_ion_img_storage_type(self._db)
            search_results.store(ion_metrics_df, ion_iso_images, mask,
                                 self._db, self._img_store, img_store_type)
        except Exception as e:
            self._db.alter(JOB_UPD, params=('FAILED',
                                            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                            self._job_id))
            msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise JobFailedError(msg) from e
        else:
            self._export_search_results_to_es(mol_db, isocalc)

    def _export_search_results_to_es(self, mol_db, isocalc):
        try:
            self._es.index_ds(self._ds.id, mol_db, isocalc)
        except Exception as e:
            self._db.alter(JOB_UPD, params=('FAILED',
                                            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                            self._job_id))
            msg = 'Export to ES failed(ds_id={}, mol_db={}): {}'.format(self._ds.id,
                                                                        mol_db, str(e))
            raise ESExportFailedError(msg) from e
        else:
            self._db.alter(JOB_UPD, params=('FINISHED',
                                            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                            self._job_id))

    def _remove_annotation_job(self, mol_db):
        logger.info("Removing job results ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                    self._ds.id, self._ds.name, mol_db.name, mol_db.version)
        self._db.alter('DELETE FROM job WHERE ds_id = %s and db_id = %s',
                       params=(self._ds.id, mol_db.id))
        self._es.delete_ds(self._ds.id, mol_db)

    def _moldb_ids(self):
        moldb_service = MolDBServiceWrapper(self._sm_config['services']['mol_db'])
        completed_moldb_ids = {moldb_service.find_db_by_id(db_id)['id']
                               for (_, db_id) in self._db.select(JOB_ID_MOLDB_ID_SEL,
                                                                 params=(self._ds.id,))}
        new_moldb_ids = {moldb_service.find_db_by_name_version(moldb_name)[0]['id']
                         for moldb_name in self._ds.config['databases']}
        return completed_moldb_ids, new_moldb_ids

    def _save_data_from_raw_ms_file(self):
        ms_file_type_config = SMConfig.get_ms_file_handler(
            self._wd_manager.local_dir.ms_file_path)
        acq_geometry_factory_module = ms_file_type_config['acq_geometry_factory']
        acq_geometry_factory = getattr(import_module(acq_geometry_factory_module['path']),
                                       acq_geometry_factory_module['name'])
        acq_geometry = acq_geometry_factory(self._wd_manager.local_dir.ms_file_path).create()
        self._ds.save_acq_geometry(self._db, acq_geometry)

        self._ds.save_ion_img_storage_type(self._db, ms_file_type_config['img_storage_type'])

    def run(self, ds):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Conversion input mass spec files to plain text format. One line - one spectrum data
            * Generation and saving to the database theoretical peaks for all formulas
              from the molecule database
            * Molecules search. The most compute intensive part. Spark is used to run it
              in distributed manner.
            * Saving results (isotope images and their metrics of quality for each putative
              molecule) to the database

        Args
        ----
        ds : sm.engine.dataset_manager.Dataset
        """
        try:
            start = time.time()

            self._init_db()
            self._es = ESExporter(self._db)
            self._ds = ds

            if self._sm_config['rabbitmq']:
                self._status_queue = QueuePublisher(config=self._sm_config['rabbitmq'],
                                                    qdesc=SM_DS_STATUS,
                                                    logger=logger)
            else:
                self._status_queue = None
            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            self._wd_manager = WorkDirManager(ds.id)
            self._configure_spark()

            if not self.no_clean:
                self._wd_manager.clean()

            self._ds_reader = DatasetReader(self._ds.input_path, self._sc, self._wd_manager)
            self._ds_reader.copy_convert_input_data()

            self._save_data_from_raw_ms_file()
            self._img_store.storage_type = self._ds.get_ion_img_storage_type(self._db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            logger.info('Dataset config:\n%s', pformat(self._ds.config))

            completed_moldb_ids, new_moldb_ids = self._moldb_ids()
            for moldb_id in completed_moldb_ids.symmetric_difference(new_moldb_ids):
                # ignore ids present in both sets
                mol_db = MolecularDB(id=moldb_id, db=self._db,
                                     iso_gen_config=self._ds.config['isotope_generation'])
                if moldb_id not in new_moldb_ids:
                    self._remove_annotation_job(mol_db)
                elif moldb_id not in completed_moldb_ids:
                    self._run_annotation_job(mol_db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FINISHED)

            logger.info("All done!")
            time_spent = time.time() - start
            logger.info('Time spent: %d mins %d secs', *divmod(int(round(time_spent)), 60))
        except Exception as e:
            if self._ds:
                ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FAILED)
            logger.error(e, exc_info=True)
            raise
        finally:
            if self._sc:
                self._sc.stop()
            if self._db:
                self._db.close()
            if self._wd_manager and not self.no_clean:
                self._wd_manager.clean()
            logger.info('*' * 150)
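# Minimal usage sketch, mirroring the integration test earlier in this section (the
# dataset record is assumed to already exist in the database, and SMConfig to be
# configured):
conf = SMConfig.get_conf()
img_store = ImageStoreServiceWrapper(conf['services']['img_service_url'])
job = SearchJob(img_store=img_store)
ds = Dataset.load(DB(conf['db']), '2000-01-01_00h00m')  # hypothetical dataset id
job.run(ds)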
"JOIN agg_formula f ON f.id = m.sf_id AND sf_db.id = f.db_id " "JOIN job j ON j.id = m.job_id " "JOIN dataset ds ON ds.id = j.ds_id " "JOIN theor_peaks tp ON tp.db_id = sf_db.id AND tp.sf_id = m.sf_id AND tp.adduct = m.adduct " "WHERE sf_db.name = %s AND ds.name = %s " "AND ROUND(sigma::numeric, 6) = %s AND charge = %s AND pts_per_mz = %s") if __name__ == "__main__": parser = argparse.ArgumentParser(description='Exporting search results into a csv file') parser.add_argument('ds_name', type=str, help='Dataset name') parser.add_argument('csv_path', type=str, help='Path for the csv file') parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path') parser.set_defaults(sm_config_path=path.join(proj_root(), 'conf/config.json')) args = parser.parse_args() SMConfig.set_path(args.sm_config_path) db = DB(SMConfig.get_conf()['db']) ds_config = db.select_one(DS_CONFIG_SEL, args.ds_name)[0] isotope_gen_config = ds_config['isotope_generation'] charge = '{}{}'.format(isotope_gen_config['charge']['polarity'], isotope_gen_config['charge']['n_charges']) export_rs = db.select(EXPORT_SEL, ds_config['database']['name'], args.ds_name, isotope_gen_config['isocalc_sigma'], charge, isotope_gen_config['isocalc_pts_per_mz']) header = ','.join(['formula_db', 'ds_name', 'sf', 'adduct', 'chaos', 'img_corr', 'pat_match', 'isocalc_sigma', 'isocalc_charge', 'isocalc_pts_per_mz', 'first_peak_mz']) + '\n' with open(args.csv_path, 'w') as f: f.write(header) f.writelines([','.join(map(str, row)) + '\n' for row in export_rs]) logger.info('Exported all search results for "%s" dataset into "%s" file', args.ds_name, args.csv_path)
def test_sm_daemons(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                    post_images_to_annot_service_mock,
                    MolDBServiceWrapperMock,
                    sm_config, test_db, es_dsl_search, clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {'iso_image_ids': ['iso_image_1', None, None, None]}
    post_images_to_annot_service_mock.return_value = {35: url_dict, 44: url_dict}

    db = DB(sm_config['db'])
    es = ESExporter(db)
    annotate_daemon = None
    update_daemon = None

    try:
        ds_config_str = open(ds_config_path).read()
        upload_dt = datetime.now()
        ds_id = '2000-01-01_00h00m'
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{"Data_Type": "Imaging MS"}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        ds = Dataset.load(db, ds_id)
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # dataset table asserts
        rows = db.select('SELECT id, name, input_path, upload_dt, status from dataset')
        input_path = join(dirname(__file__), 'data', test_ds_name)
        assert len(rows) == 1
        assert rows[0] == (ds_id, test_ds_name, input_path, upload_dt, DatasetStatus.FINISHED)

        # ms acquisition geometry asserts
        rows = db.select('SELECT acq_geometry from dataset')
        assert len(rows) == 1
        assert rows[0][0] == ds.get_acq_geometry(db)
        assert rows[0][0] == {
            ACQ_GEOMETRY_KEYS.LENGTH_UNIT: 'nm',
            ACQ_GEOMETRY_KEYS.AcqGridSection.section_name: {
                ACQ_GEOMETRY_KEYS.AcqGridSection.REGULAR_GRID: True,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_X: 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_Y: 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_X: 100,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_Y: 100
            },
            ACQ_GEOMETRY_KEYS.PixelSizeSection.section_name: {
                ACQ_GEOMETRY_KEYS.PixelSizeSection.REGULAR_SIZE: True,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_X: 100,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_Y: 100
            }
        }

        # job table asserts
        rows = db.select('SELECT db_id, ds_id, status, start, finish from job')
        assert len(rows) == 1
        db_id, ds_id, status, start, finish = rows[0]
        assert (db_id, ds_id, status) == (0, '2000-01-01_00h00m', JobStatus.FINISHED)
        assert start < finish

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf, adduct, stats, iso_image_ids '
                          'FROM iso_image_metrics '
                          'ORDER BY sf, adduct'))
        assert rows[0] == (0, 'C12H24O', '+K',
                           {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                            'total_iso_ints': [100.], 'min_iso_ints': [0],
                            'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])
        assert rows[1] == (0, 'C12H24O', '+Na',
                           {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                            'total_iso_ints': [100.], 'min_iso_ints': [0],
                            'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])

        time.sleep(1)  # Waiting for ES

        # ES asserts
        ds_docs = es_dsl_search.query('term', _type='dataset').execute().to_dict()['hits']['hits']
        assert 1 == len(ds_docs)
        ann_docs = es_dsl_search.query('term', _type='annotation').execute().to_dict()['hits']['hits']
        assert len(ann_docs) == len(rows)
        for doc in ann_docs:
            assert doc['_id'].startswith(ds_id)
    finally:
        db.close()
        if annotate_daemon:
            annotate_daemon.stop()
        if update_daemon:
            update_daemon.stop()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----
    no_clean : bool
        Don't delete interim data files
    """
    def __init__(self, img_store=None, no_clean=False):
        self.no_clean = no_clean
        self._img_store = img_store

        self._job_id = None
        self._sc = None
        self._db = None
        self._ds = None
        self._ds_reader = None
        self._status_queue = None
        self._fdr = None
        self._wd_manager = None
        self._es = None

        self._sm_config = SMConfig.get_conf()
        logger.debug('Using SM config:\n%s', pformat(self._sm_config))

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self._sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self._sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key",
                      self._sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key",
                      self._sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            sconf.set("spark.hadoop.fs.s3a.endpoint",
                      "s3.{}.amazonaws.com".format(self._sm_config['aws']['aws_region']))

        self._sc = SparkContext(master=self._sm_config['spark']['master'],
                                conf=sconf, appName='SM engine')

    def _init_db(self):
        logger.info('Connecting to the DB')
        self._db = DB(self._sm_config['db'])

    def store_job_meta(self, mol_db_id):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        rows = [(mol_db_id, self._ds.id, JobStatus.RUNNING,
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self._job_id = self._db.insert_return(JOB_INS, rows=rows)[0]

    def _run_annotation_job(self, mol_db):
        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info("Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                        self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            target_adducts = self._ds.config['isotope_generation']['adducts']
            self._fdr = FDR(job_id=self._job_id,
                            decoy_sample_size=20,
                            target_adducts=target_adducts,
                            db=self._db)

            isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
            centroids_gen = IonCentroidsGenerator(sc=self._sc, moldb_name=mol_db.name,
                                                  isocalc=isocalc)
            polarity = self._ds.config['isotope_generation']['charge']['polarity']
            all_adducts = list(set(self._sm_config['defaults']['adducts'][polarity])
                               | set(DECOY_ADDUCTS))
            centroids_gen.generate_if_not_exist(isocalc=isocalc,
                                                sfs=mol_db.sfs,
                                                adducts=all_adducts)
            target_ions = centroids_gen.ions(target_adducts)
            self._fdr.decoy_adducts_selection(target_ions)

            search_alg = MSMBasicSearch(sc=self._sc, ds=self._ds, ds_reader=self._ds_reader,
                                        mol_db=mol_db, centr_gen=centroids_gen,
                                        fdr=self._fdr, ds_config=self._ds.config)
            ion_metrics_df, ion_iso_images = search_alg.search()

            search_results = SearchResults(mol_db.id, self._job_id, search_alg.metrics.keys())
            mask = self._ds_reader.get_2d_sample_area_mask()
            img_store_type = self._ds.get_ion_img_storage_type(self._db)
            search_results.store(ion_metrics_df, ion_iso_images, mask,
                                 self._db, self._img_store, img_store_type)
        except Exception as e:
            self._db.alter(JOB_UPD_STATUS_FINISH,
                           params=(JobStatus.FAILED,
                                   datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                   self._job_id))
            msg = 'Job failed (ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise JobFailedError(msg) from e
        else:
            self._db.alter(JOB_UPD_STATUS_FINISH,
                           params=(JobStatus.FINISHED,
                                   datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                   self._job_id))

    def _remove_annotation_job(self, mol_db):
        logger.info("Removing job results ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                    self._ds.id, self._ds.name, mol_db.name, mol_db.version)
        self._db.alter('DELETE FROM job WHERE ds_id = %s and db_id = %s',
                       params=(self._ds.id, mol_db.id))
        self._es.delete_ds(self._ds.id, mol_db)

    def _moldb_ids(self):
        completed_moldb_ids = {db_id for (_, db_id)
                               in self._db.select(JOB_ID_MOLDB_ID_SEL, params=(self._ds.id,))}
        new_moldb_ids = {MolecularDB(name=moldb_name).id
                         for moldb_name in self._ds.config['databases']}
        return completed_moldb_ids, new_moldb_ids

    def _save_data_from_raw_ms_file(self):
        ms_file_type_config = SMConfig.get_ms_file_handler(self._wd_manager.local_dir.ms_file_path)
        acq_geometry_factory_module = ms_file_type_config['acq_geometry_factory']
        acq_geometry_factory = getattr(import_module(acq_geometry_factory_module['path']),
                                       acq_geometry_factory_module['name'])
        acq_geometry = acq_geometry_factory(self._wd_manager.local_dir.ms_file_path).create()
        self._ds.save_acq_geometry(self._db, acq_geometry)
        self._ds.save_ion_img_storage_type(self._db, ms_file_type_config['img_storage_type'])

    def run(self, ds):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Converting input mass spec files to plain text format, one line per spectrum
            * Generating theoretical peaks for all formulas in the molecule database
              and saving them to the database
            * Molecule search, the most compute-intensive part; Spark is used to run it
              in a distributed manner
            * Saving results (isotope images and their quality metrics for each putative
              molecule) to the database

        Args
        ----
        ds : sm.engine.dataset_manager.Dataset
        """
        try:
            logger.info('*' * 150)
            start = time.time()

            self._init_db()
            self._es = ESExporter(self._db)
            self._ds = ds

            if self._sm_config['rabbitmq']:
                self._status_queue = QueuePublisher(config=self._sm_config['rabbitmq'],
                                                    qdesc=SM_DS_STATUS,
                                                    logger=logger)
            else:
                self._status_queue = None

            self._wd_manager = WorkDirManager(ds.id)
            self._configure_spark()

            if not self.no_clean:
                self._wd_manager.clean()

            self._ds_reader = DatasetReader(self._ds.input_path, self._sc, self._wd_manager)
            self._ds_reader.copy_convert_input_data()

            self._save_data_from_raw_ms_file()
            self._img_store.storage_type = self._ds.get_ion_img_storage_type(self._db)

            logger.info('Dataset config:\n%s', pformat(self._ds.config))

            completed_moldb_ids, new_moldb_ids = self._moldb_ids()
            # ignore ids present in both sets
            for moldb_id in completed_moldb_ids.symmetric_difference(new_moldb_ids):
                mol_db = MolecularDB(id=moldb_id, db=self._db,
                                     iso_gen_config=self._ds.config['isotope_generation'])
                if moldb_id not in new_moldb_ids:
                    self._remove_annotation_job(mol_db)
                elif moldb_id not in completed_moldb_ids:
                    self._run_annotation_job(mol_db)

            logger.info("All done!")
            time_spent = time.time() - start
            logger.info('Time spent: %d mins %d secs', *divmod(int(round(time_spent)), 60))
        finally:
            if self._sc:
                self._sc.stop()
            if self._db:
                self._db.close()
            if self._wd_manager and not self.no_clean:
                self._wd_manager.clean()
            logger.info('*' * 150)
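# Usage sketch (illustrative, not part of the original source): a minimal driver for
# SearchJob using only names already defined or imported in this module. The ds_id and
# img_store arguments are hypothetical examples of what a caller would pass in.
def run_search(ds_id, img_store):
    sm_config = SMConfig.get_conf()
    db = DB(sm_config['db'])
    try:
        ds = Dataset.load(db, ds_id)  # load dataset metadata/config from the DB
        job = SearchJob(img_store=img_store, no_clean=False)
        job.run(ds)  # runs/removes annotation jobs per molecular DB and stores results
    finally:
        db.close()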
def run_coloc_jobs( sm_config, ds_id_str, sql_where, fix_missing, fix_corrupt, skip_existing, use_lithops ): assert ( len( [ data_source for data_source in [ds_id_str, sql_where, fix_missing, fix_corrupt] if data_source ] ) == 1 ), "Exactly one data source (ds_id, sql_where, fix_missing, fix_corrupt) must be specified" assert not (ds_id_str and sql_where) db = DB() if ds_id_str: ds_ids = ds_id_str.split(',') elif sql_where: ds_ids = [ id for (id,) in db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}') ] else: mol_dbs = [ (doc['id'], doc['name']) for doc in db.select_with_fields('SELECT id, name FROM molecular_db m') ] mol_db_ids, mol_db_names = map(list, zip(*mol_dbs)) fdrs = [0.05, 0.1, 0.2, 0.5] algorithms = ['median_thresholded_cosine', 'cosine'] if fix_missing: logger.info('Checking for missing colocalization jobs...') results = db.select( MISSING_COLOC_JOBS_SEL, [list(mol_db_ids), list(mol_db_names), fdrs, algorithms] ) ds_ids = [ds_id for ds_id, in results] logger.info(f'Found {len(ds_ids)} missing colocalization sets') else: logger.info( 'Checking all colocalization jobs. ' 'This is super slow: ~5 minutes per 1000 datasets...' ) results = db.select( CORRUPT_COLOC_JOBS_SEL, [list(mol_db_ids), list(mol_db_names), fdrs, algorithms] ) ds_ids = [ds_id for ds_id, in results] logger.info(f'Found {len(ds_ids)} corrupt colocalization sets') if not ds_ids: logger.warning('No datasets match filter') return if use_lithops: executor = Executor(sm_config['lithops']) for i, ds_id in enumerate(ds_ids): try: logger.info(f'Running colocalization on {i+1} out of {len(ds_ids)}') ds = Dataset.load(db, ds_id) coloc = Colocalization(db) if use_lithops: # noinspection PyUnboundLocalVariable coloc.run_coloc_job_lithops(executor, ds, reprocess=not skip_existing) else: coloc.run_coloc_job(ds, reprocess=not skip_existing) except Exception: logger.error(f'Failed to run colocalization on {ds_id}', exc_info=True)
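# CLI sketch (assumption: run_coloc_jobs is wired up as an admin script; the flag names
# below are hypothetical and only illustrate the mutually exclusive data sources that
# the assertion at the top of run_coloc_jobs enforces).
import argparse

def main():
    parser = argparse.ArgumentParser(description='(Re)run colocalization jobs')
    parser.add_argument('--ds-ids', dest='ds_id_str', help='Comma-separated dataset ids')
    parser.add_argument('--sql-where', help='SQL WHERE clause selecting datasets')
    parser.add_argument('--fix-missing', action='store_true')
    parser.add_argument('--fix-corrupt', action='store_true')
    parser.add_argument('--skip-existing', action='store_true')
    parser.add_argument('--use-lithops', action='store_true')
    args = parser.parse_args()

    sm_config = SMConfig.get_conf()
    run_coloc_jobs(
        sm_config, args.ds_id_str, args.sql_where,
        args.fix_missing, args.fix_corrupt, args.skip_existing, args.use_lithops,
    )

if __name__ == '__main__':
    main()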
def test_sm_daemons(
    MSMSearchMock,
    call_off_sample_api_mock,
    post_images_to_image_store_mock,
    get_image_mock,
    get_ion_images_for_analysis_mock,
    # fixtures
    test_db,
    es_dsl_search,
    clean_isotope_storage,
    reset_queues,
    metadata,
    ds_config,
    queue_pub,
    local_sm_config,
):
    moldb = init_moldb()

    formula_metrics_df = pd.DataFrame({
        'formula_i': [0, 1, 2],
        'ion_formula': ['C12H24O-H2O+H', 'C12H24O-H2+O2-CO+Na', 'C12H24O'],
        'formula': ['C12H24O', 'C12H24O', 'C12H24O'],
        'modifier': ['-H2O+H', '-H2+O2-CO+Na', ''],
        'chem_mod': ['', '-H2+O2', ''],
        'neutral_loss': ['-H2O', '-CO', ''],
        'adduct': ['+H', '+Na', '[M]+'],
        'chaos': [0.9, 0.9, 0.9],
        'spatial': [0.9, 0.9, 0.9],
        'spectral': [0.9, 0.9, 0.9],
        'msm': [0.9 ** 3, 0.9 ** 3, 0.9 ** 3],
        'total_iso_ints': [[100.0], [100.0], [100.0]],
        'min_iso_ints': [[0], [0], [0]],
        'max_iso_ints': [[10.0], [10.0], [10.0]],
        'fdr': [0.1, 0.1, 0.1],
    }).set_index('formula_i')

    search_algo_mock = MSMSearchMock()

    def mock_search(*args):
        # Read all spectra so that ImzML diagnostic fields are populated
        imzml_reader = MSMSearchMock.call_args_list[-1][1]['imzml_reader']
        _ = list(imzml_reader.iter_spectra(range(imzml_reader.n_spectra)))
        return [(formula_metrics_df, [], create_test_fdr_diagnostics_bundle())]

    search_algo_mock.search.side_effect = mock_search
    search_algo_mock.metrics = OrderedDict([
        ('chaos', 0),
        ('spatial', 0),
        ('spectral', 0),
        ('msm', 0),
        ('total_iso_ints', []),
        ('min_iso_ints', []),
        ('max_iso_ints', []),
    ])
    image_ids = ['iso_image_1', None, None, None]
    post_images_to_image_store_mock.return_value = {0: image_ids, 1: image_ids, 2: image_ids}

    db = DB()
    es = ESExporter(db, local_sm_config)
    ds = create_test_ds(
        name=test_ds_name,
        input_path=input_dir_path,
        config={**ds_config, 'database_ids': [moldb.id]},
        status=DatasetStatus.QUEUED,
        es=es,
    )

    queue_pub.publish({'ds_id': ds.id, 'ds_name': test_ds_name, 'action': DaemonAction.ANNOTATE})

    run_daemons(db, es, local_sm_config)

    # dataset table asserts
    rows = db.select('SELECT id, name, input_path, upload_dt, status from dataset')
    input_path = join(dirname(__file__), 'data', test_ds_name)
    assert len(rows) == 1
    assert rows[0] == (ds.id, test_ds_name, input_path, ds.upload_dt, DatasetStatus.FINISHED)

    # ms acquisition geometry asserts
    rows = db.select('SELECT acq_geometry from dataset')
    assert len(rows) == 1
    assert rows[0][0] == ds.get_acq_geometry(db)
    assert rows[0][0] == {
        'length_unit': 'nm',
        'acquisition_grid': {'regular_grid': True, 'count_x': 3, 'count_y': 3},
        'pixel_size': {'regular_size': True, 'size_x': 100, 'size_y': 100},
    }

    # job table asserts
    rows = db.select('SELECT moldb_id, ds_id, status, start, finish from job')
    assert len(rows) == 1
    moldb_id, ds_id, status, start, finish = rows[0]
    assert (moldb_id, ds_id, status) == (moldb.id, ds.id, JobStatus.FINISHED)
    assert start <= finish

    # image metrics asserts
    rows = db.select('SELECT formula, adduct, msm, stats, iso_image_ids FROM annotation')
    # Sort in Python because Postgres sorts symbols inconsistently between locales
    rows = sorted(rows, key=lambda row: row[1])
    assert len(rows) == 3
    for row, expected_adduct in zip(rows, ['+H', '+Na', '[M]+']):
        formula, adduct, msm, stats, iso_image_ids = row
        assert formula == 'C12H24O'
        assert adduct == expected_adduct
        assert np.isclose(msm, 0.9 ** 3)
        assert stats == {
            'chaos': 0.9,
            'spatial': 0.9,
            'spectral': 0.9,
            'total_iso_ints': [100.0],
            'min_iso_ints': [0],
            'max_iso_ints': [10.0],
        }
        assert iso_image_ids == ['iso_image_1', None, None, None]

    time.sleep(1)  # Waiting for ES

    # ES asserts
    ds_docs = es_dsl_search.query('term', _type='dataset').execute().to_dict()['hits']['hits']
    assert 1 == len(ds_docs)
    ann_docs = es_dsl_search.query('term', _type='annotation').execute().to_dict()['hits']['hits']
    assert len(ann_docs) == len(rows)
    for doc in ann_docs:
        assert doc['_id'].startswith(ds_id)
class TheorPeaksGenerator(object):
    """ Generator of theoretical isotope peaks for all molecules in a database.

    Args
    ----
    sc : pyspark.SparkContext
    sm_config : dict
        SM engine config
    ds_config : dict
        Dataset config
    """
    def __init__(self, sc, sm_config, ds_config):
        # TODO: replace sm_config with db
        self.sc = sc
        self.sm_config = sm_config
        self.ds_config = ds_config

        self.theor_peaks_tmp_dir = join(sm_config['fs']['base_path'], 'tmp_theor_peaks_gen')
        self.db = DB(sm_config['db'])

        self.adducts = self.ds_config['isotope_generation']['adducts']

        self.isocalc_wrapper = IsocalcWrapper(self.ds_config['isotope_generation'])

    @staticmethod
    def _sf_elements(sf):
        return [seg.element().name() for seg in parseSumFormula(sf).get_segments()]

    @classmethod
    def _valid_sf_adduct(cls, sf, adduct):
        if sf is None or adduct is None or sf == 'None' or adduct == 'None':
            logger.warning('Invalid sum formula or adduct: sf=%s, adduct=%s', sf, adduct)
            return False

        if '-' in adduct and adduct.strip('-') not in cls._sf_elements(sf):
            logger.info('No negative adduct element in the sum formula: sf=%s, adduct=%s',
                        sf, adduct)
            return False

        return True

    def run(self):
        """ Starts peaks generation. Checks all formula peaks saved in the database
        and generates peaks only for new ones.
        """
        logger.info('Running theoretical peaks generation')

        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
        formula_list = self.apply_database_filters(self.db.select(AGG_FORMULA_SEL, db_id))
        stored_sf_adduct = self.db.select(SF_ADDUCT_SEL, db_id,
                                          self.isocalc_wrapper.sigma,
                                          self.isocalc_wrapper.charge,
                                          self.isocalc_wrapper.pts_per_mz)

        sf_adduct_cand = self.find_sf_adduct_cand(formula_list, set(stored_sf_adduct))
        logger.info('%d saved (sf, adduct)s, %d not saved (sf, adduct)s',
                    len(stored_sf_adduct), len(sf_adduct_cand))

        if sf_adduct_cand:
            self.generate_theor_peaks(sf_adduct_cand)

    def apply_database_filters(self, formula_list):
        """ Filters formulas according to the settings in the dataset config

        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through

        Returns
        -------
        : list
            Filtered list of pairs (id, sum formula)
        """
        if 'organic' in map(lambda s: s.lower(), self.ds_config['database'].get('filters', [])):
            logger.info('Organic sum formula filter has been applied')
            return [(id, sf) for (id, sf) in formula_list if 'C' in self._sf_elements(sf)]
        return formula_list

    def find_sf_adduct_cand(self, formula_list, stored_sf_adduct):
        """
        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through
        stored_sf_adduct : set
            Set of (formula, adduct) pairs which have theoretical patterns saved in the database

        Returns
        -------
        : list
            List of (formula id, formula, adduct) triples which don't have
            theoretical patterns saved in the database
        """
        assert formula_list, 'Empty agg_formula table!'
        adducts = set(self.adducts) | set(DECOY_ADDUCTS)
        cand = [(id, sf, a) for (id, sf) in formula_list for a in adducts]
        return [(sf_id, sf, adduct) for (sf_id, sf, adduct) in cand
                if (sf, adduct) not in stored_sf_adduct]

    def generate_theor_peaks(self, sf_adduct_cand):
        """ Generates theoretical patterns for the given candidates and saves them to the DB

        Args
        ----
        sf_adduct_cand : list
            List of (formula id, formula, adduct) triples which don't have
            theoretical patterns saved in the database
        """
        logger.info('Generating missing peaks')
        formatted_iso_peaks = self.isocalc_wrapper.formatted_iso_peaks
        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
        n = 10000
        for i in range(0, len(sf_adduct_cand), n):
            sf_adduct_cand_rdd = self.sc.parallelize(sf_adduct_cand[i:i + n], numSlices=128)
            peak_lines = (sf_adduct_cand_rdd
                          .flatMap(lambda args: formatted_iso_peaks(db_id, *args))
                          .collect())
            self._import_theor_peaks_to_db(peak_lines)

    def _import_theor_peaks_to_db(self, peak_lines):
        logger.info('Saving new peaks to the DB')
        if not exists(self.theor_peaks_tmp_dir):
            makedirs(self.theor_peaks_tmp_dir)

        peak_lines_path = join(self.theor_peaks_tmp_dir, 'peak_lines.csv')
        with open(peak_lines_path, 'w') as f:
            f.write('\n'.join(peak_lines))

        with open(peak_lines_path) as peaks_file:
            self.db.copy(peaks_file, 'theor_peaks')
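# Usage sketch (illustrative; assumes a live SparkContext and the same sm_config and
# ds_config dicts used by the surrounding code, not taken from the original source).
def generate_missing_peaks(sc, sm_config, ds_config):
    gen = TheorPeaksGenerator(sc, sm_config, ds_config)
    # Only (formula, adduct) pairs missing from the theor_peaks table are generated;
    # pairs already stored for the current isocalc parameters are skipped.
    gen.run()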