def test_add_optical_image(self, fill_db, sm_config, ds_config):
    """add_optical_image should store zoomed copies plus a thumbnail and record them in the DB."""
    db = DB(sm_config['db'])
    queue_mock = MagicMock(spec=QueuePublisher)
    es_mock = MagicMock(spec=ESExporter)
    img_store_mock = MagicMock(ImageStoreServiceWrapper)
    img_store_mock.post_image.side_effect = [
        'opt_img_id1', 'opt_img_id2', 'opt_img_id3', 'thumbnail_id'
    ]
    img_store_mock.get_image_by_id.return_value = Image.new('RGB', (100, 100))

    ds_man = create_api_ds_man(
        sm_config=sm_config, db=db, es=es_mock, img_store=img_store_mock,
        annot_queue=queue_mock,
    )
    ds_man._annotation_image_shape = MagicMock(return_value=(100, 100))
    ds_id = '2000-01-01'
    ds = create_ds(ds_id=ds_id, ds_config=ds_config)

    zoom_levels = [1, 2, 3]
    raw_img_id = 'raw_opt_img_id'
    identity_transform = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
    ds_man.add_optical_image(ds, raw_img_id, identity_transform, zoom_levels=zoom_levels)

    expected_rows = [
        ('opt_img_id{}'.format(idx + 1), ds.id, zoom)
        for idx, zoom in enumerate(zoom_levels)
    ]
    assert db.select('SELECT * FROM optical_image') == expected_rows
    assert db.select(
        'SELECT optical_image FROM dataset where id = %s', params=(ds_id,)
    ) == [(raw_img_id,)]
    assert db.select(
        'SELECT thumbnail FROM dataset where id = %s', params=(ds_id,)
    ) == [('thumbnail_id',)]
def get_ds_moldb_ids(ds_id: str, status: Optional[str] = None):
    """Return the moldb ids of the dataset's jobs, optionally restricted to one job status."""
    if status is None:
        return DB().select_onecol(
            'SELECT j.moldb_id FROM job j WHERE ds_id = %s', (ds_id,)
        )
    return DB().select_onecol(
        'SELECT j.moldb_id FROM job j WHERE ds_id = %s AND status = %s', (ds_id, status)
    )
def run_off_sample(sm_config, ds_ids_str, sql_where, fix_missing, overwrite_existing):
    """Run off-sample classification and reindexing for a selection of datasets.

    Selection is by explicit comma-separated ids, by a raw SQL filter, or by
    scanning for datasets missing off-sample results. Failures on one dataset
    are logged and do not stop the batch.
    """
    db = DB()

    ds_ids = None
    if ds_ids_str:
        ds_ids = ds_ids_str.split(',')
    elif sql_where:
        # NOTE: sql_where is interpolated verbatim — admin CLI input only, never untrusted
        rows = db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ds_ids = [row_id for (row_id,) in rows]
    elif fix_missing:
        logger.info('Checking for missing off-sample jobs...')
        ds_ids = [row_id for (row_id,) in db.select(MISSING_OFF_SAMPLE_SEL)]
        logger.info(f'Found {len(ds_ids)} missing off-sample sets')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    es_exp = ESExporter(db, sm_config)
    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(f'Running off-sample on {i+1} out of {len(ds_ids)}')
            ds = Dataset.load(db, ds_id)
            classify_dataset_ion_images(db, ds, sm_config['services'], overwrite_existing)
            es_exp.reindex_ds(ds_id)
        except Exception:
            logger.error(f'Failed to run off-sample on {ds_id}', exc_info=True)
def test_classify_ion_images_preds_saved(call_api_mock, image_storage_mock, fill_db):
    """Off-sample predictions returned by the API must be stored per annotation, in order."""
    call_api_mock.return_value = {
        'predictions': [{'prob': 0.1, 'label': 'on'}, {'prob': 0.9, 'label': 'off'}]
    }

    # Serve a tiny valid PNG for every ion image request
    buf = io.BytesIO()
    Image.new('RGBA', (10, 10)).save(buf, format='PNG')
    buf.seek(0)
    image_storage_mock.get_image.return_value = buf.read()

    db = DB()
    ds_id = '2000-01-01'
    ds = Dataset.load(db, ds_id)
    services_config = defaultdict(str)
    classify_dataset_ion_images(db, ds, services_config)

    annotations = db.select_with_fields(
        (
            'select off_sample '
            'from dataset d '
            'join job j on j.ds_id = d.id '
            'join annotation m on m.job_id = j.id '
            'where d.id = %s '
            'order by m.id '
        ),
        params=(ds_id,),
    )
    exp_annotations = [
        {'off_sample': {'prob': 0.1, 'label': 'on'}},
        {'off_sample': {'prob': 0.9, 'label': 'off'}},
    ]
    assert annotations == exp_annotations
def test_create_moldb(fill_db, is_public):
    """A valid moldb upload creates the molecular_db row and its molecule rows."""
    input_doc = moldb_input_doc(
        file_path=f's3://{BUCKET_NAME}/{MoldbFiles.VALID.value}', is_public=is_public)

    with patch_bottle_request(input_doc) as input_doc:
        resp = api.databases.create()

        assert resp['status'] == 'success'
        resp_doc = resp['data']

        db = DB()
        doc = db.select_one_with_fields(
            'SELECT id, name, version, group_id, is_public FROM molecular_db where id = %s',
            params=(resp_doc['id'],),
        )
        # The stored row must echo the submitted metadata
        for field in ['name', 'version', 'group_id', 'is_public']:
            assert doc[field] == input_doc[field]

        docs = db.select_with_fields(
            'SELECT * FROM molecule WHERE moldb_id = %s',
            params=(resp_doc['id'],),
        )
        for doc in docs:
            print(doc)
            for field in ['mol_id', 'mol_name', 'formula', 'inchi']:
                assert field in doc
def test_save_sf_iso_images_correct_db_call(spark_context, create_fill_sm_database, sm_config,
                                            ds_config):
    """store_sf_iso_images must persist sparse iso images as pixel-index/intensity rows."""
    iso_imgs_rdd = spark_context.parallelize([((1, '+H'), [
        csr_matrix([[100, 0, 0], [0, 0, 0]]),
        csr_matrix([[0, 0, 0], [0, 0, 10]])
    ])])
    sf_adduct_peaksn = [(1, '+H', 2)]
    res = SearchResults(0, 0, 0, 'ds_name', sf_adduct_peaksn, db_mock, sm_config, ds_config)
    res.sf_iso_images = iso_imgs_rdd
    res.nrows, res.ncols = 2, 3
    res.store_sf_iso_images()

    expected = [(0, 0, 1, '+H', 0, [0], [100], 0, 100),
                (0, 0, 1, '+H', 1, [5], [10], 0, 10)]

    db = DB(sm_config['db'])
    try:
        stored = db.select((
            'SELECT job_id, db_id, sf_id, adduct, peak, pixel_inds, intensities, min_int, max_int '
            'FROM iso_image '
            'ORDER BY sf_id, adduct'))
        assert expected == stored
    finally:
        db.close()
def update_optical_images(ds_id_str, sql_where):
    """Regenerate optical images for datasets picked by id list or raw SQL filter.

    Datasets without both a stored transform and a raw optical image are skipped;
    per-dataset failures are logged and the batch continues.
    """
    db = DB()
    if ds_id_str:
        ds_ids = ds_id_str.split(',')
    else:
        # NOTE: sql_where interpolated verbatim — admin CLI input only
        rows = db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ds_ids = [row_id for (row_id,) in rows]

    for i, ds_id in enumerate(ds_ids):
        try:
            transform, img_id = db.select_one(
                'SELECT transform, optical_image from dataset WHERE id = %s',
                params=(ds_id,))
            if not (img_id and transform):
                logger.info(f'[{i + 1}/{len(ds_ids)}] Skipping dataset {ds_id}')
                continue
            logger.info(
                f'[{i + 1}/{len(ds_ids)}] Updating optical image of dataset {ds_id}'
            )
            add_optical_image(db, ds_id, img_id, transform)
        except Exception:
            logger.error(f'Failed to update optical image on {ds_id}', exc_info=True)
def reindex_results(sm_config, ds_id, ds_mask, use_inactive_index, offline_reindex,
                    update_fields):
    """Reindex dataset annotation documents in Elasticsearch.

    Datasets are chosen either by explicit comma-separated ids (`ds_id`), by a
    name-prefix mask (`ds_mask`), or — with `offline_reindex` — by a full
    offline rebuild of the index. `update_fields` switches to a partial
    document update instead of full reindexing.
    """
    assert ds_id or ds_mask or offline_reindex

    IsocalcWrapper.set_centroids_cache_enabled(True)

    if offline_reindex:
        _reindex_all(sm_config)
        return

    es_config = sm_config['elasticsearch']
    if use_inactive_index:
        es_config = get_inactive_index_es_config(es_config)
    db = DB()
    es_exp = ESExporter(db, sm_config={**sm_config, 'elasticsearch': es_config})

    if ds_id:
        ds_ids = ds_id.split(',')
    elif ds_mask:
        # FIX: the mask was previously spliced into the SQL with str.format,
        # allowing SQL injection; use a parameterized LIKE pattern instead.
        ds_ids = [
            row_id for (row_id,) in db.select(
                'select id from dataset where name like %s', params=(ds_mask + '%',)
            )
        ]
    else:
        ds_ids = []

    if update_fields:
        _partial_update_datasets(ds_ids, es_exp, update_fields.split(','))
    else:
        _reindex_datasets(ds_ids, es_exp)
def run(sm_config, ds_id_str, sql_where, algorithm, use_lithops):
    """Generate ion thumbnails for selected datasets, optionally via Lithops.

    Selection is by raw SQL filter or comma-separated id list; per-dataset
    failures are logged and the batch continues.
    """
    db = DB()
    if sql_where:
        rows = db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ds_ids = [row_id for (row_id,) in rows]
    else:
        ds_ids = ds_id_str.split(',')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    if use_lithops:
        executor = Executor(sm_config['lithops'])

    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(
                f'[{i+1} / {len(ds_ids)}] Generating ion thumbnail for {ds_id}'
            )
            ds = Dataset.load(db, ds_id)
            if not use_lithops:
                generate_ion_thumbnail(db, ds, algorithm=algorithm)
            else:
                # noinspection PyUnboundLocalVariable
                generate_ion_thumbnail_lithops(executor, db, ds, algorithm=algorithm)
        except Exception:
            logger.error(f'Failed on {ds_id}', exc_info=True)
def test_save_ds_meta_ds_doesnt_exist(spark_context, create_test_db, drop_test_db,
                                      sm_config, ds_config):
    """save_ds_meta should insert dataset metadata and coordinates when absent."""
    wd_mock = MagicMock(WorkDirManager)
    wd_mock.ds_coord_path = '/ds_path'
    wd_mock.txt_path = '/txt_path'
    SMConfig._config_dict = sm_config

    with patch('sm.engine.tests.util.SparkContext.textFile') as text_file_mock:
        text_file_mock.return_value = spark_context.parallelize(['0,1,1\n', '1,100,200\n'])

        dataset = Dataset(spark_context, 'ds_name', '', ds_config, wd_mock,
                          DB(sm_config['db']))
        dataset.save_ds_meta()

    db = DB(sm_config['db'])
    ds_row = db.select_one('SELECT name, file_path, img_bounds, config from dataset')
    expected_bounds = {u'x': {u'min': 1, u'max': 100}, u'y': {u'min': 1, u'max': 200}}
    assert ds_row == ('ds_name', '/txt_path', expected_bounds, ds_config)

    coord_row = db.select_one('SELECT xs, ys from coordinates')
    assert coord_row == ([1, 100], [1, 200])
    db.close()
def __init__(self, sm_config, analysis_version, database):
    """Set up report paths, DB handle and metric list for one sci-test run."""
    reports_path = Path(proj_root()) / 'tests/reports'
    timestamp = datetime.now().replace(microsecond=0).isoformat().replace(':', '-')
    suffix = f'{database}-v{analysis_version}'

    self.sm_config = sm_config
    self.db = DB()
    self.ds_id = '2000-01-01_00h00m01s'
    # Reference (baseline) vs freshly produced results, timestamped per run
    self.ref_results_path = reports_path / f'spheroid-{suffix}.csv'
    self.output_results_path = reports_path / f'test-{suffix}-{timestamp}.csv'
    self.ds_name = 'sci_test_spheroid_untreated'
    self.ds_data_path = join(self.sm_config['fs']['spark_data_path'], self.ds_name)
    self.moldb = MOL_DBS[database]
    self.analysis_version = analysis_version
    self.input_path = join(proj_root(), 'tests/data/untreated')
    self.ds_config_path = join(self.input_path, 'config.json')
    self.metrics = [
        'chaos', 'spatial', 'spectral', 'mz_err_abs', 'mz_err_rel', 'msm', 'fdr'
    ]
    self.comparison_df = None
def _upload_moldbs_from_db(moldb_ids, storage, sm_storage):
    """Ensure each moldb's formula list exists in cloud storage; return their definitions.

    Existing objects are reused; missing ones are built from the `molecule`
    table and uploaded. Each returned dict carries id, cloud object and the
    moldb's `targeted` flag.
    """
    bucket, prefix = sm_storage['moldb']
    moldb_defs = []
    # Sort the moldbs because the centroids cache key is affected by their order
    for moldb_id in sorted(moldb_ids):
        key = f'{prefix}/{moldb_id}'
        try:
            storage.head_object(bucket, key)
            logger.debug(f'Found mol db at {key}')
            cobject = CloudObject(storage.backend, bucket, key)
        except StorageNoSuchKeyError:
            logger.info(f'Uploading {key}...')
            formula_rows = DB().select(
                'SELECT DISTINCT formula FROM molecule WHERE moldb_id = %s', (moldb_id,))
            formulas = [formula for (formula,) in formula_rows]
            cobject = save_cobj(storage, formulas, bucket=bucket, key=key)
            logger.info(f'Uploading {key}...Done')

        (targeted,) = DB().select_one(
            'SELECT targeted FROM molecular_db WHERE id = %s', (moldb_id,))
        moldb_defs.append({'id': moldb_id, 'cobj': cobject, 'targeted': targeted})

    return moldb_defs
def init_moldb():
    """Create a test molecular DB seeded with a single molecule row."""
    db = DB()
    moldb = create_test_molecular_db()
    db.insert(
        "INSERT INTO molecule (mol_id, mol_name, formula, moldb_id) VALUES (%s, %s, %s, %s)",
        rows=[('HMDB0001', 'molecule name', 'C12H24O', moldb.id)],
    )
    return moldb
def test_delete_ds(self, EsMock, fill_db):
    """Deleting a dataset must remove its DB row and its ES documents."""
    db = DB()
    manager = create_daemon_man(db=db, es=EsMock())
    ds_id = '2000-01-01'
    ds = create_ds(ds_id=ds_id)

    manager.delete(ds)

    EsMock.return_value.delete_ds.assert_has_calls([call(ds_id)])
    row = db.select_one('SELECT * FROM dataset WHERE id = %s', params=(ds_id,))
    assert row == []
def reindex_all_results(conf):
    """Drop and recreate the 'sm' ES index, then reindex every dataset."""
    db = DB(conf['db'])
    es_exp = ESExporter(conf)

    es_exp.delete_index(name='sm')
    es_exp.create_index(name='sm')

    select = "select name, config -> 'database'::text -> 'name'::text from dataset"
    for ds_name, db_name in db.select(select):
        es_exp.index_ds(db, ds_name, db_name)
def reindex_all_results(conf):
    """Rebuild the 'sm' ES index from scratch, indexing all datasets."""
    db = DB(conf['db'])
    es_exp = ESExporter(conf)
    es_exp.delete_index(name='sm')
    es_exp.create_index(name='sm')

    pairs = db.select(
        "select name, config -> 'database'::text -> 'name'::text from dataset")
    for dataset_name, moldb_name in pairs:
        es_exp.index_ds(db, dataset_name, moldb_name)
def test_theor_peaks_generator_run_failed_iso_peaks(create_fill_test_db, spark_context,
                                                    sm_config, ds_config):
    """When isotope peak generation yields empty centroids, no new rows are added."""
    ds_config["isotope_generation"]["adducts"] = ["+Na"]
    gen = TheorPeaksGenerator(spark_context, sm_config, ds_config)
    gen.isocalc_wrapper.isotope_peaks = lambda *args: Centroids([], [])
    gen.run()

    db = DB(sm_config['db'])
    stored = db.select('SELECT * FROM theor_peaks')
    assert len(stored) == 1
    db.close()
def test_create_moldb_malformed_csv(file, fill_db):
    """A malformed CSV upload must be rejected and leave the DB untouched."""
    input_doc = moldb_input_doc(file_path=f's3://{BUCKET_NAME}/{file.value}')

    with patch_bottle_request(input_doc):
        resp = api.databases.create()

    assert resp['status'] == MALFORMED_CSV['status']
    assert resp['error']

    (db_count,) = DB().select_one(MOLDB_COUNT_SEL)
    assert db_count == 0
def test_delete_moldb(fill_db):
    """Deleting a moldb via the API must remove it from the database."""
    input_doc = moldb_input_doc(
        file_path=f's3://{BUCKET_NAME}/{MoldbFiles.VALID.value}')
    moldb = create_test_molecular_db(**input_doc)

    with patch_bottle_request(req_doc={}):
        resp = api.databases.delete(moldb_id=moldb.id)

    assert resp['status'] == 'success'

    (db_count,) = DB().select_one(MOLDB_COUNT_SEL)
    assert db_count == 0
def save_additional_info_to_db(db_id, user_id, input_path):
    """Attach user and input-path provenance to an existing molecular_db row.

    Prints a message and makes no change when the given id does not exist.
    """
    conf = SMConfig.get_conf()
    with ConnectionPool(conf['db']):
        db = DB()
        if db.select_one('SELECT * FROM molecular_db WHERE id = %s', (db_id,)):
            print(f'Updating existing molecular database {db_id}')
            # FIX: reuse the existing DB facade instead of constructing a
            # second one (DB()) just for the UPDATE — consistent and avoids
            # an extra object per call.
            db.alter(
                'UPDATE molecular_db SET user_id = %s, input_path = %s WHERE id = %s',
                (user_id, input_path, db_id),
            )
        else:
            print(f'Specified molecular database {db_id} does not exist.')
def test_create_moldb_empty_values(fill_db):
    """A CSV containing empty values must be rejected with details, creating nothing."""
    input_doc = moldb_input_doc(
        file_path=f's3://{BUCKET_NAME}/{MoldbFiles.EMPTY_VALUES.value}')

    with patch_bottle_request(input_doc):
        resp = api.databases.create()

    assert resp['status'] == BAD_DATA['status']
    assert resp['error'] and resp['details']

    (db_count,) = DB().select_one(MOLDB_COUNT_SEL)
    assert db_count == 0
def __init__(self, sc, sm_config, ds_config):
    """Keep the Spark context, configs, DB handle and isotope-generation helpers."""
    self.sc = sc
    self.sm_config = sm_config
    self.ds_config = ds_config
    self.theor_peaks_tmp_dir = join(sm_config['fs']['base_path'], 'tmp_theor_peaks_gen')
    self.db = DB(sm_config['db'])

    iso_gen_config = self.ds_config['isotope_generation']
    self.adducts = iso_gen_config['adducts']
    self.isocalc_wrapper = IsocalcWrapper(iso_gen_config)
def fill_db(test_db, sm_config, ds_config):
    """Seed the test DB with one finished dataset row."""
    upload_dt = '2000-01-01 00:00:00'
    ds_id = '2000-01-01'
    metadata = {"meta": "data"}

    insert_sql = (
        'INSERT INTO dataset (id, name, input_path, upload_dt, metadata, config, status, '
        'is_public, mol_dbs, adducts) '
        'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
    )
    row = (ds_id, 'ds_name', 'input_path', upload_dt,
           json.dumps(metadata), json.dumps(ds_config),
           DatasetStatus.FINISHED, True, ['HMDB-v4'], ['+H', '+Na', '+K'])

    DB(sm_config['db']).insert(insert_sql, rows=[row])
def test_sm_daemons_annot_fails(
    MSMSearchMock,
    post_images_to_image_store_mock,
    test_db,
    es_dsl_search,
    clean_isotope_storage,
    reset_queues,
    metadata,
    ds_config,
    queue_pub,
    local_sm_config,
):
    """A search() that raises must leave the dataset in FAILED status."""
    moldb = init_moldb()

    def raise_test_exception(*args, **kwargs):
        raise Exception('Test exception')

    msm_algo_mock = MSMSearchMock()
    msm_algo_mock.search.side_effect = raise_test_exception

    image_ids = ['iso_image_1', None, None, None]
    post_images_to_image_store_mock.return_value = {
        0: image_ids, 1: image_ids, 2: image_ids
    }

    db = DB()
    es = ESExporter(db, local_sm_config)
    ds = create_test_ds(
        name=test_ds_name,
        input_path=input_dir_path,
        config={**ds_config, 'database_ids': [moldb.id]},
        status=DatasetStatus.QUEUED,
        es=es,
    )

    queue_pub.publish(
        {'ds_id': ds.id, 'ds_name': test_ds_name, 'action': DaemonAction.ANNOTATE}
    )

    run_daemons(db, es, local_sm_config)

    # dataset and job tables asserts
    row = db.select_one('SELECT status from dataset')
    assert len(row) == 1
    assert row[0] == 'FAILED'
def _import_molecules(moldb, moldb_df, targeted_threshold):
    """Bulk-load a moldb's molecules via COPY and flag small DBs as targeted."""
    logger.info(f'{moldb}: importing {len(moldb_df)} molecules')

    columns = ['moldb_id', 'mol_id', 'mol_name', 'formula']
    moldb_df = moldb_df.assign(moldb_id=int(moldb.id))

    # COPY from an in-memory TSV — far faster than row-by-row INSERTs
    buffer = StringIO()
    moldb_df[columns].to_csv(buffer, sep='\t', index=False, header=False)
    buffer.seek(0)
    DB().copy(buffer, sep='\t', table='molecule', columns=columns)
    logger.info(f'{moldb}: inserted {len(moldb_df)} molecules')

    # Few unique formulas => a targeted (curated) database
    targeted = moldb_df.formula.unique().shape[0] <= targeted_threshold
    DB().alter('UPDATE molecular_db SET targeted = %s WHERE id = %s',
               params=(targeted, moldb.id))
def __init__(self, sm_config_path):
    """Initialize paths, DB connection and metric list for the sci test."""
    self.sm_config_path = sm_config_path
    self.sm_config = SMConfig.get_conf()
    self.db = DB(self.sm_config['db'])

    self.ds_id = '2000-01-01-00_00_00'
    self.ds_name = 'sci_test_spheroid_untreated'
    self.base_search_res_path = join(proj_root(), 'tests/reports',
                                     'spheroid_untreated_search_res.csv')
    self.data_dir_path = join(self.sm_config['fs']['base_path'], self.ds_name)
    self.input_path = join(proj_root(), 'tests/data/untreated')
    self.ds_config_path = join(self.input_path, 'config.json')
    self.metrics = ['chaos', 'spatial', 'spectral']
def migrate_optical_images(ds_id):
    """Move a dataset's optical images (and thumbnail) to image storage, recording URLs."""
    output.print('Migrating optical images')
    with timeit():
        output.print('Transferring images and updating database...')
        db = DB()

        for opt_image_id, opt_image_url in db.select(SEL_OPTICAL_IMGS, params=(ds_id,)):
            # Only migrate rows that have an image but no URL yet (idempotent)
            if opt_image_id and not opt_image_url:
                transfer_images(
                    ds_id,
                    'optical_images',
                    image_storage.OPTICAL,
                    [opt_image_id],
                )
                opt_image_url = image_storage.get_image_url(
                    image_storage.OPTICAL, ds_id, opt_image_id
                )
                db.alter(UPD_OPTICAL_IMGS, params=(opt_image_url, opt_image_id))

        opt_thumb_id, opt_thumb_url = db.select_one(SEL_OPT_THUMB, params=(ds_id,))
        if opt_thumb_id and not opt_thumb_url:
            transfer_images(
                ds_id,
                'optical_images',
                image_storage.OPTICAL,
                [opt_thumb_id],
            )
            opt_thumb_url = image_storage.get_image_url(
                image_storage.OPTICAL, ds_id, opt_thumb_id
            )
            db.alter(UPD_OPT_THUMB, params=(opt_thumb_url, ds_id))
def fill_test_db(create_test_db, drop_test_db):
    """Reset and seed the sm_test DB with a single dataset row.

    FIX: the original wrapped the statements in ``except: raise`` — a bare
    re-raise is a no-op (and a bare except is an anti-pattern); plain
    try/finally is equivalent and still guarantees the connection closes.
    """
    db_config = dict(database='sm_test', user='******', host='localhost', password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE dataset CASCADE')
        db.insert("INSERT INTO dataset VALUES (%s, %s, %s, %s, %s)",
                  [(1, 'ds_name', '/ds_path', json.dumps({}), json.dumps({}))])
        db.alter('TRUNCATE coordinates CASCADE')
    finally:
        db.close()
def create_fill_sm_database(create_test_db, drop_test_db, sm_config):
    """Create the SM schema in the test DB and seed formula tables.

    FIX: removed the pointless ``except: raise`` wrapper (a bare re-raise is a
    no-op and a bare except is an anti-pattern); try/finally alone preserves
    the guaranteed close.
    """
    local('psql -h localhost -U sm sm_test < {}'.format(
        join(proj_dir_path, 'scripts/create_schema.sql')))

    db = DB(sm_config['db'])
    try:
        db.insert('INSERT INTO formula_db VALUES (%s, %s, %s)',
                  [(0, '2016-01-01', 'HMDB')])
        db.insert('INSERT INTO formula VALUES (%s, %s, %s, %s, %s)',
                  [(100, 0, '00001', 'compound_name', 'C12H24O')])
        db.insert('INSERT INTO agg_formula VALUES (%s, %s, %s, %s, %s)',
                  [(10007, 0, 'C12H24O', ['00001'], ['compound_name'])])
    finally:
        db.close()
def test_sm_daemons_annot_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                post_images_to_annot_service_mock,
                                MolDBServiceWrapperMock,
                                sm_config, test_db, es_dsl_search, clean_isotope_storage):
    """A metrics computation that raises must mark both dataset and job FAILED.

    NOTE(review): relies on module-level fixtures/globals (queue_pub,
    ds_config_path, test_ds_name, input_dir_path, data_dir_path) — presumably
    defined at file scope; verify against the full module.
    """
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)

    # Make the image-metrics step blow up inside the annotation pipeline
    def throw_exception_function(*args):
        raise Exception('Test')
    get_compute_img_metrics_mock.return_value = throw_exception_function
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])
    es = ESExporter(db)
    annotate_daemon = None

    try:
        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        ds_config_str = open(ds_config_path).read()
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        ds = Dataset.load(db, ds_id)
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # dataset and job tables asserts
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
        row = db.select_one('SELECT status from job')
        assert row[0] == 'FAILED'
    finally:
        db.close()
        if annotate_daemon:
            annotate_daemon.stop()
        # Best-effort cleanup of the work dir; ignore rm failures
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
def fill_test_db(create_test_db, drop_test_db):
    """Reset and seed the sm_test DB with a single dataset row (duplicate fixture).

    FIX: dropped the no-op ``except: raise`` wrapper; try/finally alone keeps
    the guaranteed connection close without a bare except.
    """
    db_config = dict(database='sm_test', user='******', host='localhost', password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE dataset CASCADE')
        db.insert("INSERT INTO dataset VALUES (%s, %s, %s, %s, %s)",
                  [(1, 'ds_name', '/ds_path', json.dumps({}), json.dumps({}))])
        db.alter('TRUNCATE coordinates CASCADE')
    finally:
        db.close()
def migrate_isotopic_images(ds_id):
    """Copy a dataset's isotopic images to the new storage, then reindex its ES docs."""
    output.print('Migrating isotopic images')

    db = DB()
    img_ids = db.select_onecol(SEL_DS_IMG_IDS, params=(ds_id,))
    es_exporter = ESExporter(db, sm_config)

    # Skip datasets with nothing to move or that were already migrated
    if not img_ids or _es_docs_migrated(es_exporter._es, ds_id):
        return

    with timeit():
        output.print('Transferring images...')
        output.print(len(img_ids))
        transfer_images(ds_id, 'iso_images', image_storage.ISO, img_ids)

    with timeit():
        output.print('Reindexing ES documents...')
        es_exporter.reindex_ds(ds_id)
def migrate_ion_thumbnail(ds_id):
    """Move a dataset's ion thumbnail to image storage and record its URL."""
    output.print('Migrating ion thumbnail images')
    with timeit():
        output.print('Transferring images and updating database...')
        db = DB()
        ion_thumb_id, ion_thumbnail_url = db.select_one(SEL_ION_THUMB, params=(ds_id,))
        # Only migrate when a thumbnail exists and has no URL yet (idempotent)
        if ion_thumb_id and not ion_thumbnail_url:
            transfer_images(
                ds_id,
                'ion_thumbnails',
                image_storage.THUMB,
                [ion_thumb_id],
            )
            ion_thumb_url = image_storage.get_image_url(
                image_storage.THUMB, ds_id, ion_thumb_id
            )
            db.alter(UPD_ION_THUMB, params=(ion_thumb_url, ds_id))
class SciTester(object):
    """Compares current search results against a saved baseline report.

    FIX: the original used Python-2-only syntax (``print`` statement and
    ``dict.iteritems()``); both replaced with forms valid on Python 2 and 3
    (``print(...)`` and ``.items()``) without changing behavior.
    """

    def __init__(self, db_config):
        self.db = DB(db_config)
        self.base_search_res_path = join(proj_root(), 'tests/reports',
                                         'spheroid_12h_search_res.csv')
        self.metrics = ['chaos', 'spatial', 'spectral']

    def metr_dict_to_array(self, metr_d):
        # Fixed metric order so arrays are comparable across runs
        return np.array([metr_d[m] for m in self.metrics])

    def read_base_search_res(self):
        """Parse the baseline TSV into {(sf, adduct): metric array}."""
        with open(self.base_search_res_path) as f:
            rows = map(lambda line: line.strip('\n').split('\t'), f.readlines()[1:])
            return {(r[0], r[1]): np.array(r[2:], dtype=float) for r in rows}

    def fetch_search_res(self):
        """Fetch the current search results from the DB as {(sf, adduct): metric array}.

        NOTE(review): ``ds_name`` is presumably a module-level global — verify.
        """
        rows = self.db.select(SEARCH_RES_SELECT, ds_name, 'HMDB')
        return {(r[0], r[1]): self.metr_dict_to_array(r[2]) for r in rows}

    def run_sci_test(self):
        compare_search_results(self.read_base_search_res(), self.fetch_search_res())

    def save_sci_test_report(self):
        """Overwrite the baseline report with the current search results."""
        with open(self.base_search_res_path, 'w') as f:
            f.write('\t'.join(['sf', 'adduct'] + self.metrics) + '\n')
            for (sf, adduct), metrics in sorted(self.fetch_search_res().items()):
                f.write('\t'.join([sf, adduct] + metrics.astype(str).tolist()) + '\n')

        print('Successfully saved sample dataset search report')
def create_test_db():
    """Drop and recreate the sm_test database, then load the SM schema.

    FIX: the DB connection was leaked if DROP/CREATE raised; close it in a
    ``finally`` block so the autocommit connection is always released.
    """
    db_config = dict(database='postgres', user='******', host='localhost')
    db = DB(db_config, autocommit=True)
    try:
        db.alter('DROP DATABASE IF EXISTS sm_test')
        db.alter('CREATE DATABASE sm_test')
    finally:
        db.close()

    local('psql -h localhost -U sm sm_test < {}'.format(
        join(proj_root(), 'scripts/create_schema.sql')))
def create_fill_sm_database(create_test_db, drop_test_db, sm_config):
    """Create the SM schema and seed formula tables (duplicate fixture).

    FIX: removed the no-op ``except: raise`` (bare except anti-pattern);
    try/finally alone preserves the guaranteed close.
    """
    local('psql -h localhost -U sm sm_test < {}'.format(
        join(proj_dir_path, 'scripts/create_schema.sql')))

    db = DB(sm_config['db'])
    try:
        db.insert('INSERT INTO formula_db VALUES (%s, %s, %s)',
                  [(0, '2016-01-01', 'HMDB')])
        db.insert('INSERT INTO formula VALUES (%s, %s, %s, %s, %s)',
                  [(100, 0, '00001', 'compound_name', 'C12H24O')])
        db.insert('INSERT INTO agg_formula VALUES (%s, %s, %s, %s, %s)',
                  [(10007, 0, 'C12H24O', ['00001'], ['compound_name'])])
    finally:
        db.close()
def __init__(self, sc, sm_config, ds_config):
    # TODO: replace sm_config with db
    """Keep Spark context, configs, DB handle and isotope-generation helpers."""
    self.sc = sc
    self.sm_config = sm_config
    self.ds_config = ds_config
    self.db = DB(sm_config['db'])
    self.theor_peaks_tmp_dir = join(sm_config['fs']['base_path'], 'tmp_theor_peaks_gen')
    self.adducts = self.ds_config['isotope_generation']['adducts']
    self.isocalc_wrapper = IsocalcWrapper(self.ds_config['isotope_generation'])
def __init__(self, sm_config_path):
    """Set up config, DB handle and sci-test paths/metrics."""
    self.sm_config_path = sm_config_path
    self.sm_config = SMConfig.get_conf()
    self.db = DB(self.sm_config['db'])

    self.ds_id = '2000-01-01-00_00_00'
    self.ds_name = 'sci_test_spheroid_untreated'
    self.input_path = join(proj_root(), 'tests/data/untreated')

    self.base_search_res_path = join(proj_root(), 'tests/reports',
                                     'spheroid_untreated_search_res.csv')
    self.data_dir_path = join(self.sm_config['fs']['base_path'], self.ds_name)
    self.ds_config_path = join(self.input_path, 'config.json')
    self.metrics = ['chaos', 'spatial', 'spectral']
def test_search_job_imzml_example_es_export_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                                  post_images_to_annot_service_mock,
                                                  MolDBServiceWrapperMock, MolDBServiceWrapperMock2,
                                                  sm_config, create_fill_sm_database, es_dsl_search,
                                                  clean_isotope_storage):
    """If ES indexing fails after a successful search, the job must raise
    ESExportFailedError and the dataset must be marked FAILED.

    NOTE(review): depends on module-level names (ds_config_path, test_ds_name,
    input_dir_path, data_dir_path) — presumably defined at file scope.
    """
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock2)

    # The search itself succeeds: fixed metrics and image URLs are mocked in
    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])

    def throw_exception_function(*args):
        raise Exception('Test')

    try:
        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        ds_config_str = open(ds_config_path).read()
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        # Force only the ES export step to fail
        with patch('sm.engine.search_job.ESExporter.index_ds') as index_ds_mock:
            index_ds_mock.side_effect = throw_exception_function

            img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url'])
            job = SearchJob(img_store=img_store)
            ds = Dataset.load(db, ds_id)
            job.run(ds)
    except ESExportFailedError as e:
        assert e
        # dataset table asserts
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
    else:
        raise AssertionError('ESExportFailedError should be raised')
    finally:
        db.close()
        # Best-effort cleanup of the work dir; ignore rm failures
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
def test_search_job_imzml_example(get_compute_img_measures_mock, create_fill_sm_database, sm_config):
    """End-to-end run of SearchJob on the imzML example dataset, asserting the
    dataset metadata, theoretical peaks, image metrics and iso images stored.

    NOTE(review): depends on module-level names (input_dir_path, ds_config_path,
    data_dir_path, test_ds_name, DECOY_ADDUCTS) — presumably defined at file scope.
    """
    get_compute_img_measures_mock.return_value = lambda *args: (0.9, 0.9, 0.9)

    SMConfig._config_dict = sm_config

    db = DB(sm_config['db'])
    try:
        job = SearchJob(None, 'imzml_example_ds')
        job.run(input_dir_path, ds_config_path, clean=True)

        # dataset meta asserts
        rows = db.select("SELECT name, file_path, img_bounds from dataset")
        img_bounds = {u'y': {u'max': 3, u'min': 1}, u'x': {u'max': 3, u'min': 1}}
        file_path = 'file://' + join(data_dir_path, 'ds.txt')
        assert len(rows) == 1
        assert rows[0] == (test_ds_name, file_path, img_bounds)

        # theoretical patterns asserts
        rows = db.select('SELECT db_id, sf_id, adduct, centr_mzs, centr_ints, prof_mzs, prof_ints '
                         'FROM theor_peaks '
                         'ORDER BY adduct')

        # 3 target adducts plus one row per decoy adduct
        assert len(rows) == 3 + len(DECOY_ADDUCTS)
        for r in rows:
            assert r[3] and r[4]

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf_id, adduct, peaks_n, stats FROM iso_image_metrics '
                          'ORDER BY sf_id, adduct'))

        assert rows
        assert rows[0]
        assert tuple(rows[0][:2]) == (0, 10007)
        assert set(rows[0][4].keys()) == {'chaos', 'spatial', 'spectral'}

        # image asserts
        rows = db.select(('SELECT db_id, sf_id, adduct, peak, intensities, min_int, max_int '
                          'FROM iso_image '
                          'ORDER BY sf_id, adduct'))
        assert rows

        max_int = 0.0
        for r in rows:
            max_int = max(max_int, r[-1])
            assert tuple(r[:2]) == (0, 10007)
        # At least one stored image must carry non-zero intensity
        assert max_int
    finally:
        db.close()
        # Best-effort cleanup of the work dir; ignore rm failures
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----
    no_clean : bool
        Don't delete interim data files
    """
    def __init__(self, img_store=None, no_clean=False):
        self.no_clean = no_clean
        self._img_store = img_store

        # Lazily initialized in run(): job id, Spark context, DB connection,
        # dataset/reader, status queue, FDR estimator, work dir and ES exporter
        self._job_id = None
        self._sc = None
        self._db = None
        self._ds = None
        self._ds_reader = None
        self._status_queue = None
        self._fdr = None
        self._wd_manager = None
        self._es = None

        self._sm_config = SMConfig.get_conf()

        logger.debug('Using SM config:\n%s', pformat(self._sm_config))

    def _configure_spark(self):
        """Build the SparkContext from config, wiring S3A credentials when AWS is configured."""
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self._sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self._sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key",
                      self._sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key",
                      self._sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            sconf.set("spark.hadoop.fs.s3a.endpoint",
                      "s3.{}.amazonaws.com".format(self._sm_config['aws']['aws_region']))

        self._sc = SparkContext(master=self._sm_config['spark']['master'], conf=sconf,
                                appName='SM engine')

    def _init_db(self):
        """Open the engine's DB connection."""
        logger.info('Connecting to the DB')
        self._db = DB(self._sm_config['db'])

    def store_job_meta(self, mol_db_id):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        rows = [(mol_db_id, self._ds.id, 'STARTED',
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self._job_id = self._db.insert_return(JOB_INS, rows=rows)[0]

    def _run_annotation_job(self, mol_db):
        """Run one annotation job against a molecular DB; marks the job FAILED on error.

        On success, results are stored and then exported to Elasticsearch.
        """
        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info("Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                        self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            target_adducts = self._ds.config['isotope_generation']['adducts']
            self._fdr = FDR(job_id=self._job_id,
                            decoy_sample_size=20,
                            target_adducts=target_adducts,
                            db=self._db)

            isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
            centroids_gen = IonCentroidsGenerator(sc=self._sc, moldb_name=mol_db.name,
                                                  isocalc=isocalc)
            polarity = self._ds.config['isotope_generation']['charge']['polarity']
            # Target adducts for this polarity plus the decoys used for FDR estimation
            all_adducts = list(set(self._sm_config['defaults']['adducts'][polarity])
                               | set(DECOY_ADDUCTS))
            centroids_gen.generate_if_not_exist(isocalc=isocalc,
                                                sfs=mol_db.sfs,
                                                adducts=all_adducts)
            target_ions = centroids_gen.ions(target_adducts)
            self._fdr.decoy_adducts_selection(target_ions)

            search_alg = MSMBasicSearch(sc=self._sc, ds=self._ds, ds_reader=self._ds_reader,
                                        mol_db=mol_db, centr_gen=centroids_gen,
                                        fdr=self._fdr, ds_config=self._ds.config)
            ion_metrics_df, ion_iso_images = search_alg.search()

            search_results = SearchResults(mol_db.id, self._job_id,
                                           search_alg.metrics.keys())
            mask = self._ds_reader.get_2d_sample_area_mask()
            img_store_type = self._ds.get_ion_img_storage_type(self._db)
            search_results.store(ion_metrics_df, ion_iso_images, mask,
                                 self._db, self._img_store, img_store_type)
        except Exception as e:
            # Mark the job FAILED and chain the original cause
            self._db.alter(JOB_UPD, params=('FAILED',
                                            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                            self._job_id))
            msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise JobFailedError(msg) from e
        else:
            self._export_search_results_to_es(mol_db, isocalc)

    def _export_search_results_to_es(self, mol_db, isocalc):
        """Index the finished job in ES; marks the job FAILED if export fails, FINISHED otherwise."""
        try:
            self._es.index_ds(self._ds.id, mol_db, isocalc)
        except Exception as e:
            self._db.alter(JOB_UPD, params=('FAILED',
                                            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                            self._job_id))
            msg = 'Export to ES failed(ds_id={}, mol_db={}): {}'.format(self._ds.id,
                                                                        mol_db, str(e))
            raise ESExportFailedError(msg) from e
        else:
            self._db.alter(JOB_UPD, params=('FINISHED',
                                            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                            self._job_id))

    def _remove_annotation_job(self, mol_db):
        """Delete a moldb's job row and its ES documents for this dataset."""
        logger.info("Removing job results ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                    self._ds.id, self._ds.name, mol_db.name, mol_db.version)
        self._db.alter('DELETE FROM job WHERE ds_id = %s and db_id = %s',
                       params=(self._ds.id, mol_db.id))
        self._es.delete_ds(self._ds.id, mol_db)

    def _moldb_ids(self):
        """Return (already-completed moldb ids, moldb ids requested by the ds config)."""
        moldb_service = MolDBServiceWrapper(self._sm_config['services']['mol_db'])
        completed_moldb_ids = {moldb_service.find_db_by_id(db_id)['id']
                               for (_, db_id) in self._db.select(JOB_ID_MOLDB_ID_SEL,
                                                                 params=(self._ds.id,))}
        new_moldb_ids = {moldb_service.find_db_by_name_version(moldb_name)[0]['id']
                         for moldb_name in self._ds.config['databases']}
        return completed_moldb_ids, new_moldb_ids

    def _save_data_from_raw_ms_file(self):
        """Derive and persist acquisition geometry and image storage type from the raw MS file."""
        ms_file_type_config = SMConfig.get_ms_file_handler(
            self._wd_manager.local_dir.ms_file_path)
        acq_geometry_factory_module = ms_file_type_config['acq_geometry_factory']
        # The factory class is configured by dotted path and resolved dynamically
        acq_geometry_factory = getattr(import_module(acq_geometry_factory_module['path']),
                                       acq_geometry_factory_module['name'])
        acq_geometry = acq_geometry_factory(self._wd_manager.local_dir.ms_file_path).create()
        self._ds.save_acq_geometry(self._db, acq_geometry)
        self._ds.save_ion_img_storage_type(self._db,
                                           ms_file_type_config['img_storage_type'])

    def run(self, ds):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Conversion input mass spec files to plain text format. One line - one spectrum data
            * Generation and saving to the database theoretical peaks for all formulas from the molecule database
            * Molecules search. The most compute intensive part. Spark is used to run it in distributed manner.
            * Saving results (isotope images and their metrics of quality for each putative molecule) to the database

        Args
        ----
            ds : sm.engine.dataset_manager.Dataset
        """
        try:
            start = time.time()

            self._init_db()
            self._es = ESExporter(self._db)
            self._ds = ds

            # Status updates are published to RabbitMQ only when configured
            if self._sm_config['rabbitmq']:
                self._status_queue = QueuePublisher(config=self._sm_config['rabbitmq'],
                                                    qdesc=SM_DS_STATUS,
                                                    logger=logger)
            else:
                self._status_queue = None
            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            self._wd_manager = WorkDirManager(ds.id)
            self._configure_spark()

            if not self.no_clean:
                self._wd_manager.clean()

            self._ds_reader = DatasetReader(self._ds.input_path, self._sc, self._wd_manager)
            self._ds_reader.copy_convert_input_data()

            self._save_data_from_raw_ms_file()
            self._img_store.storage_type = self._ds.get_ion_img_storage_type(self._db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            logger.info('Dataset config:\n%s', pformat(self._ds.config))

            completed_moldb_ids, new_moldb_ids = self._moldb_ids()
            for moldb_id in completed_moldb_ids.symmetric_difference(new_moldb_ids):  # ignore ids present in both sets
                mol_db = MolecularDB(id=moldb_id, db=self._db,
                                     iso_gen_config=self._ds.config['isotope_generation'])
                if moldb_id not in new_moldb_ids:
                    self._remove_annotation_job(mol_db)
                elif moldb_id not in completed_moldb_ids:
                    self._run_annotation_job(mol_db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FINISHED)

            logger.info("All done!")
            time_spent = time.time() - start
            logger.info('Time spent: %d mins %d secs', *divmod(int(round(time_spent)), 60))
        except Exception as e:
            if self._ds:
                ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FAILED)
            logger.error(e, exc_info=True)
            raise
        finally:
            # Always release Spark, DB and (unless no_clean) the work dir
            if self._sc:
                self._sc.stop()
            if self._db:
                self._db.close()
            if self._wd_manager and not self.no_clean:
                self._wd_manager.clean()
            logger.info('*' * 150)
class SciTester(object):
    """Scientific regression tester for the sample 'spheroid untreated' dataset.

    Runs a full annotation search over the sample dataset and compares the
    resulting metrics against a CSV baseline stored in the repository,
    reporting missed formulas, false discoveries and per-metric differences.
    """

    def __init__(self, sm_config_path):
        self.sm_config_path = sm_config_path
        self.sm_config = SMConfig.get_conf()
        self.db = DB(self.sm_config['db'])

        self.ds_id = '2000-01-01-00_00_00'
        self.base_search_res_path = join(proj_root(), 'tests/reports', 'spheroid_untreated_search_res.csv')
        self.ds_name = 'sci_test_spheroid_untreated'
        self.data_dir_path = join(self.sm_config['fs']['base_path'], self.ds_name)
        self.input_path = join(proj_root(), 'tests/data/untreated')
        self.ds_config_path = join(self.input_path, 'config.json')
        # Metric names in the order they appear in the baseline CSV columns.
        self.metrics = ['chaos', 'spatial', 'spectral']

    def metr_dict_to_array(self, metr_d):
        """Convert a metrics dict into an array [chaos, spatial, spectral, msm]
        where msm is the product of the three individual metrics."""
        metric_array = np.array([metr_d[m] for m in self.metrics])
        return np.hstack([metric_array, metric_array.prod()])

    def read_base_search_res(self):
        """Read the baseline CSV into {(sf, adduct): metric array (float)}."""
        def prep_metric_arrays(a):
            return np.array(a, dtype=float)

        with open(self.base_search_res_path) as f:
            rows = map(lambda line: line.strip('\n').split('\t'), f.readlines()[1:])
            return {(r[0], r[1]): prep_metric_arrays(r[2:]) for r in rows}

    def fetch_search_res(self):
        """Fetch current search results from the DB as
        {(sf, adduct): metric array} for the HMDB-v2.5 molecular database."""
        mol_db_service = MolDBServiceWrapper(self.sm_config['services']['mol_db'])
        mol_db_id = mol_db_service.find_db_by_name_version('HMDB-v2.5')[0]['id']
        rows = self.db.select(SEARCH_RES_SELECT, params=(mol_db_id, self.ds_name))
        return {(r[0], r[1]): self.metr_dict_to_array(r[2]) for r in rows}

    def save_sci_test_report(self):
        """Overwrite the baseline CSV with the current search results."""
        with open(self.base_search_res_path, 'w') as f:
            f.write('\t'.join(['sf', 'adduct'] + self.metrics) + '\n')
            for (sf, adduct), metrics in sorted(self.fetch_search_res().items()):
                f.write('\t'.join([sf, adduct] + metrics.astype(str).tolist()) + '\n')

        print('Successfully saved sample dataset search report')

    @staticmethod
    def print_metric_hist(metric_arr, bins=10):
        """Print a histogram of metric values over [-1, 1].

        Args
        ----
        metric_arr : array-like
            Metric values to bucket.
        bins : int
            Number of buckets per unit interval; 2 * bins buckets in total.
        """
        # BUG FIX: `bins` was previously ignored — the edges were hard-coded to
        # np.linspace(-1, 1, 21). Derive the edge count from the parameter
        # instead; the default (bins=10 -> 21 edges) is unchanged.
        metric_freq, metric_interv = np.histogram(metric_arr, bins=np.linspace(-1, 1, 2 * bins + 1))
        metric_interv = [round(x, 2) for x in metric_interv]
        pprint(list(zip(zip(metric_interv[:-1], metric_interv[1:]), metric_freq)))

    def report_metric_differences(self, metrics_array):
        """Print per-metric histograms for an array of metric (difference) rows,
        one column per metric: chaos, spatial (img_corr), spectral (pat_match), msm."""
        metrics_array = np.array(metrics_array)
        print("\nCHAOS HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 0])
        print("\nIMG_CORR HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 1])
        print("\nPAT_MATCH HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 2])
        print("\nMSM HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 3])

    def _missed_formulas(self, old, new):
        """Report baseline (sf, adduct) pairs absent from the new results.
        Returns True when anything was missed."""
        missed_sf_adduct = set(old.keys()) - set(new.keys())
        print('MISSED FORMULAS: {:.1f}%'.format(len(missed_sf_adduct) / len(old) * 100))
        if missed_sf_adduct:
            missed_sf_base_metrics = np.array([old[k] for k in missed_sf_adduct])
            self.report_metric_differences(missed_sf_base_metrics)
        return bool(missed_sf_adduct)

    def _false_discovery(self, old, new):
        """Report (sf, adduct) pairs present in the new results but not in the
        baseline. Returns True when any were found."""
        new_sf_adduct = set(new.keys()) - set(old.keys())
        print('\nFALSE DISCOVERY: {:.1f}%'.format(len(new_sf_adduct) / len(old) * 100))

        if new_sf_adduct:
            for sf_adduct in new_sf_adduct:
                metrics = new[sf_adduct]
                print('{} metrics = {}'.format(sf_adduct, metrics))
        return bool(new_sf_adduct)

    def _metrics_diff(self, old, new):
        """Report metric differences (beyond 1e-6) for pairs present in both
        result sets. Returns True when any differences were found."""
        print('\nDIFFERENCE IN METRICS:')
        metric_diffs = []
        for b_sf_add, b_metr in old.items():
            if b_sf_add in new.keys():
                metr = new[b_sf_add]
                diff = b_metr - metr
                if np.any(np.abs(diff) > 1e-6):
                    metric_diffs.append(diff)
                    print('{} metrics diff = {}'.format(b_sf_add, diff))

        if metric_diffs:
            self.report_metric_differences(metric_diffs)
        return bool(metric_diffs)

    def search_results_are_different(self):
        """Compare the current search results against the baseline.

        NOTE: the checks short-circuit — once one category reports differences,
        the remaining ones are not printed.
        """
        old_search_res = self.read_base_search_res()
        search_res = self.fetch_search_res()
        return (self._missed_formulas(old_search_res, search_res) or
                self._false_discovery(old_search_res, search_res) or
                self._metrics_diff(old_search_res, search_res))

    def _create_img_store_mock(self):
        # Minimal stand-in for the image service: all operations are no-ops.
        class ImageStoreMock(object):
            def post_image(self, *args):
                return None

            def delete_image_by_id(self, *args):
                return None

        return ImageStoreMock()

    def run_search(self, mock_img_store=False):
        """Run the full annotation pipeline on the sample dataset.

        Args
        ----
        mock_img_store : bool
            When True, image posting/deletion is stubbed out so no image
            service is required.
        """
        if mock_img_store:
            img_store = self._create_img_store_mock()
        else:
            img_store = ImageStoreServiceWrapper(self.sm_config['services']['img_service_url'])
        ds_man = SMDaemonDatasetManager(db=self.db, es=ESExporter(self.db),
                                        img_store=img_store, mode='local')
        ds = create_ds_from_files(self.ds_id, self.ds_name, self.input_path)

        from sm.engine import SearchJob
        ds_man.add(ds, search_job_factory=SearchJob, del_first=True)

    def clear_data_dirs(self):
        """Remove the dataset's work directory (best-effort)."""
        with warn_only():
            local('rm -rf {}'.format(self.data_dir_path))
def __init__(self, db_config): self.db = DB(db_config) self.base_search_res_path = join(proj_root(), 'tests/reports', 'spheroid_12h_search_res.csv') self.metrics = ['chaos', 'spatial', 'spectral']
"JOIN agg_formula f ON f.id = m.sf_id AND sf_db.id = f.db_id " "JOIN job j ON j.id = m.job_id " "JOIN dataset ds ON ds.id = j.ds_id " "JOIN theor_peaks tp ON tp.db_id = sf_db.id AND tp.sf_id = m.sf_id AND tp.adduct = m.adduct " "WHERE sf_db.name = %s AND ds.name = %s " "AND ROUND(sigma::numeric, 6) = %s AND charge = %s AND pts_per_mz = %s") if __name__ == "__main__": parser = argparse.ArgumentParser(description='Exporting search results into a csv file') parser.add_argument('ds_name', type=str, help='Dataset name') parser.add_argument('csv_path', type=str, help='Path for the csv file') parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path') parser.set_defaults(sm_config_path=path.join(proj_root(), 'conf/config.json')) args = parser.parse_args() SMConfig.set_path(args.sm_config_path) db = DB(SMConfig.get_conf()['db']) ds_config = db.select_one(DS_CONFIG_SEL, args.ds_name)[0] isotope_gen_config = ds_config['isotope_generation'] charge = '{}{}'.format(isotope_gen_config['charge']['polarity'], isotope_gen_config['charge']['n_charges']) export_rs = db.select(EXPORT_SEL, ds_config['database']['name'], args.ds_name, isotope_gen_config['isocalc_sigma'], charge, isotope_gen_config['isocalc_pts_per_mz']) header = ','.join(['formula_db', 'ds_name', 'sf', 'adduct', 'chaos', 'img_corr', 'pat_match', 'isocalc_sigma', 'isocalc_charge', 'isocalc_pts_per_mz', 'first_peak_mz']) + '\n' with open(args.csv_path, 'w') as f: f.write(header) f.writelines([','.join(map(str, row)) + '\n' for row in export_rs]) logger.info('Exported all search results for "%s" dataset into "%s" file', args.ds_name, args.csv_path)
class TheorPeaksGenerator(object):
    """ Generator of theoretical isotope peaks for all molecules in a database.

    NOTE(review): this class uses Python 2-only constructs (tuple-unpacking
    lambdas removed by PEP 3113, `xrange`, list-returning `filter`); it will
    not run unchanged under Python 3.

    Args
    ----------
    sc : pyspark.SparkContext
    sm_config : dict
        SM engine config
    ds_config : dict
        Dataset config
    """
    def __init__(self, sc, sm_config, ds_config):
        # TODO: replace sm_config with db
        self.sc = sc
        self.sm_config = sm_config
        self.ds_config = ds_config

        # Scratch directory used to stage the generated peak CSV before COPY.
        self.theor_peaks_tmp_dir = join(sm_config['fs']['base_path'], 'tmp_theor_peaks_gen')
        self.db = DB(sm_config['db'])

        self.adducts = self.ds_config['isotope_generation']['adducts']

        self.isocalc_wrapper = IsocalcWrapper(self.ds_config['isotope_generation'])

    @staticmethod
    def _sf_elements(sf):
        # Element names present in the parsed sum formula.
        return [seg.element().name() for seg in parseSumFormula(sf).get_segments()]

    @classmethod
    def _valid_sf_adduct(cls, sf, adduct):
        """Reject None/'None' formulas or adducts, and negative adducts whose
        element does not occur in the formula (nothing to subtract)."""
        if sf is None or adduct is None or sf == 'None' or adduct == 'None':
            logger.warning('Invalid sum formula or adduct: sf=%s, adduct=%s', sf, adduct)
            return False

        if '-' in adduct and adduct.strip('-') not in cls._sf_elements(sf):
            logger.info('No negative adduct element in the sum formula: sf=%s, adduct=%s', sf, adduct)
            return False

        return True

    def run(self):
        """ Starts peaks generation. Checks all formula peaks saved in the database
        and generates peaks only for new ones"""
        logger.info('Running theoretical peaks generation')

        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
        formula_list = self.apply_database_filters(self.db.select(AGG_FORMULA_SEL, db_id))
        # Already-generated (sf, adduct) pairs for the current isocalc settings.
        stored_sf_adduct = self.db.select(SF_ADDUCT_SEL, db_id,
                                          self.isocalc_wrapper.sigma,
                                          self.isocalc_wrapper.charge,
                                          self.isocalc_wrapper.pts_per_mz)

        sf_adduct_cand = self.find_sf_adduct_cand(formula_list, set(stored_sf_adduct))
        logger.info('%d saved (sf, adduct)s, %s not saved (sf, adduct)s', len(stored_sf_adduct),
                    len(sf_adduct_cand))

        if sf_adduct_cand:
            self.generate_theor_peaks(sf_adduct_cand)

    def apply_database_filters(self, formula_list):
        """ Filters according to settings in dataset config

        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through

        Returns
        -------
        : list
            Filtered list of pairs (id, sum formula)
        """
        if 'organic' in map(lambda s: s.lower(), self.ds_config['database'].get('filters', [])):
            logger.info('Organic sum formula filter has been applied')
            # Python 2 only: tuple-unpacking lambda; `filter` returns a list.
            return filter(lambda (_, sf): 'C' in self._sf_elements(sf), formula_list)
        return formula_list

    def find_sf_adduct_cand(self, formula_list, stored_sf_adduct):
        """
        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through
        stored_sf_adduct : set
            Set of (formula, adduct) pairs which have theoretical patterns saved in the database

        Returns
        -------
        : list
            List of (formula id, formula, adduct) triples which don't have theoretical patterns saved in the database
        """
        # NOTE: 'Emtpy' typo below is part of the runtime assert message; left as-is.
        assert formula_list, 'Emtpy agg_formula table!'
        # Target adducts plus the decoy adducts used for FDR estimation.
        adducts = set(self.adducts) | set(DECOY_ADDUCTS)
        cand = [(id, sf, a) for (id, sf) in formula_list for a in adducts]
        # Python 2 only: tuple-unpacking lambda; `filter` returns a list.
        return filter(lambda (sf_id, sf, adduct): (sf, adduct) not in stored_sf_adduct, cand)

    def generate_theor_peaks(self, sf_adduct_cand):
        """
        Args
        ----
        sf_adduct_cand : list
            List of (formula id, formula, adduct) triples which don't have theoretical patterns saved in the database

        Returns
        -------
        : list
            List of strings with formatted theoretical peaks data
        """
        logger.info('Generating missing peaks')
        formatted_iso_peaks = self.isocalc_wrapper.formatted_iso_peaks
        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
        # Process candidates in batches of 10k to bound driver memory usage.
        n = 10000
        for i in xrange(0, len(sf_adduct_cand), n):
            sf_adduct_cand_rdd = self.sc.parallelize(sf_adduct_cand[i:i+n], numSlices=128)
            peak_lines = (sf_adduct_cand_rdd
                          .flatMap(lambda (sf_id, sf, adduct): formatted_iso_peaks(db_id, sf_id, sf, adduct))
                          .collect())
            self._import_theor_peaks_to_db(peak_lines)

    def _import_theor_peaks_to_db(self, peak_lines):
        """Write the formatted peak lines to a temp CSV and bulk-COPY them
        into the `theor_peaks` table."""
        logger.info('Saving new peaks to the DB')
        if not exists(self.theor_peaks_tmp_dir):
            makedirs(self.theor_peaks_tmp_dir)

        peak_lines_path = join(self.theor_peaks_tmp_dir, 'peak_lines.csv')
        with open(peak_lines_path, 'w') as f:
            f.write('\n'.join(peak_lines))

        with open(peak_lines_path) as peaks_file:
            self.db.copy(peaks_file, 'theor_peaks')
def _init_db(self): logger.info('Connecting to the DB') self._db = DB(self._sm_config['db'])
def test_search_job_imzml_example(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                  post_images_to_annot_service_mock,
                                  MolDBServiceWrapperMock, MolDBServiceWrapperMock2,
                                  sm_config, create_fill_sm_database, es_dsl_search,
                                  clean_isotope_storage):
    """End-to-end SearchJob run on the imzML example dataset.

    Mocks the mol_db service, image metrics computation and image posting,
    then asserts the dataset/job/metrics DB rows and the ES documents.
    """
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock2)

    # Every candidate gets the same fixed metrics; the filter is a pass-through.
    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    # NOTE(review): keys 35 and 44 are presumably ion ids expected by the job —
    # confirm against the annotation service contract.
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])

    try:
        ds_config_str = open(ds_config_path).read()
        upload_dt = datetime.now()
        ds_id = '2000-01-01_00h00m'
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url'])
        job = SearchJob(img_store=img_store)
        job._sm_config['rabbitmq'] = {}  # avoid talking to RabbitMQ during the test
        ds = Dataset.load(db, ds_id)
        job.run(ds)

        # dataset table asserts
        rows = db.select('SELECT id, name, input_path, upload_dt, status from dataset')
        input_path = join(dirname(__file__), 'data', test_ds_name)
        assert len(rows) == 1
        assert rows[0] == (ds_id, test_ds_name, input_path, upload_dt, DatasetStatus.FINISHED)

        # ms acquisition geometry asserts
        rows = db.select('SELECT acq_geometry from dataset')
        assert len(rows) == 1
        assert rows[0][0] == ds.get_acq_geometry(db)
        assert rows[0][0] == {
            ACQ_GEOMETRY_KEYS.LENGTH_UNIT: 'nm',
            ACQ_GEOMETRY_KEYS.AcqGridSection.section_name: {
                ACQ_GEOMETRY_KEYS.AcqGridSection.REGULAR_GRID: True,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_X : 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_Y : 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_X : 100,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_Y : 100
            },
            ACQ_GEOMETRY_KEYS.PixelSizeSection.section_name: {
                ACQ_GEOMETRY_KEYS.PixelSizeSection.REGULAR_SIZE: True,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_X : 100,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_Y : 100
            }
        }

        # job table asserts
        rows = db.select('SELECT db_id, ds_id, status, start, finish from job')
        assert len(rows) == 1
        db_id, ds_id, status, start, finish = rows[0]
        assert (db_id, ds_id, status) == (0, '2000-01-01_00h00m', 'FINISHED')
        assert start < finish

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf, adduct, stats, iso_image_ids '
                          'FROM iso_image_metrics '
                          'ORDER BY sf, adduct'))

        assert rows[0] == (0, 'C12H24O', '+K', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                'total_iso_ints': [100.], 'min_iso_ints': [0],
                                                'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])
        assert rows[1] == (0, 'C12H24O', '+Na', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                 'total_iso_ints': [100.], 'min_iso_ints': [0],
                                                 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])

        time.sleep(1)  # Waiting for ES

        # ES asserts
        ds_docs = es_dsl_search.query('term', _type='dataset').execute().to_dict()['hits']['hits']
        assert 1 == len(ds_docs)
        ann_docs = es_dsl_search.query('term', _type='annotation').execute().to_dict()['hits']['hits']
        assert len(ann_docs) == len(rows)
        for doc in ann_docs:
            assert doc['_id'].startswith(ds_id)

    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
def fin(): db_config = dict(database='postgres', user='******', host='localhost', password='******') db = DB(db_config, autocommit=True) db.alter('DROP DATABASE IF EXISTS sm_test') db.close()