def test_save_ds_meta_ds_doesnt_exist(spark_context, create_test_db, drop_test_db, sm_config, ds_config):
    work_dir_man_mock = MagicMock(WorkDirManager)
    work_dir_man_mock.ds_coord_path = '/ds_path'
    work_dir_man_mock.txt_path = '/txt_path'

    SMConfig._config_dict = sm_config

    with patch('sm.engine.tests.util.SparkContext.textFile') as m:
        m.return_value = spark_context.parallelize(['0,1,1\n', '1,100,200\n'])

        dataset = Dataset(spark_context, 'ds_name', '', 'input_path', ds_config,
                          work_dir_man_mock, DB(sm_config['db']))
        dataset.save_ds_meta()

    db = DB(sm_config['db'])
    ds_row = db.select_one('SELECT name, file_path, img_bounds, config from dataset')
    assert ds_row == ('ds_name', 'input_path',
                      {u'x': {u'min': 1, u'max': 100}, u'y': {u'min': 1, u'max': 200}},
                      ds_config)

    coord_row = db.select_one('SELECT xs, ys from coordinates')
    assert coord_row == ([1, 100], [1, 200])

    db.close()
def test_save_ds_meta_ds_doesnt_exist(spark_context, create_test_db, drop_test_db, sm_config, ds_config):
    work_dir_man_mock = MagicMock(WorkDirManager)
    work_dir_man_mock.ds_coord_path = '/ds_path'
    work_dir_man_mock.txt_path = '/txt_path'

    SMConfig._config_dict = sm_config

    with patch('sm.engine.tests.util.SparkContext.textFile') as m:
        m.return_value = spark_context.parallelize(['0,1,1\n', '1,100,200\n'])

        dataset = Dataset(spark_context, 'ds_name', '', ds_config,
                          work_dir_man_mock, DB(sm_config['db']))
        dataset.save_ds_meta()

    db = DB(sm_config['db'])
    ds_row = db.select_one('SELECT name, file_path, img_bounds, config from dataset')
    assert ds_row == ('ds_name', '/txt_path',
                      {u'x': {u'min': 1, u'max': 100}, u'y': {u'min': 1, u'max': 200}},
                      ds_config)

    coord_row = db.select_one('SELECT xs, ys from coordinates')
    assert coord_row == ([1, 100], [1, 200])

    db.close()
def test_sm_daemons_annot_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                post_images_to_annot_service_mock, MolDBServiceWrapperMock,
                                sm_config, test_db, es_dsl_search, clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)

    def throw_exception_function(*args):
        raise Exception('Test')

    get_compute_img_metrics_mock.return_value = throw_exception_function
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {'iso_image_ids': ['iso_image_1', None, None, None]}
    post_images_to_annot_service_mock.return_value = {35: url_dict, 44: url_dict}

    db = DB(sm_config['db'])
    es = ESExporter(db)
    annotate_daemon = None

    try:
        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        ds_config_str = open(ds_config_path).read()
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        ds = Dataset.load(db, ds_id)
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # dataset and job tables asserts
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
        row = db.select_one('SELECT status from job')
        assert row[0] == 'FAILED'
    finally:
        db.close()
        if annotate_daemon:
            annotate_daemon.stop()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
def test_search_job_imzml_example_es_export_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                                  post_images_to_annot_service_mock,
                                                  MolDBServiceWrapperMock, MolDBServiceWrapperMock2,
                                                  sm_config, create_fill_sm_database, es_dsl_search,
                                                  clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock2)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {'iso_image_ids': ['iso_image_1', None, None, None]}
    post_images_to_annot_service_mock.return_value = {35: url_dict, 44: url_dict}

    db = DB(sm_config['db'])

    def throw_exception_function(*args):
        raise Exception('Test')

    try:
        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        ds_config_str = open(ds_config_path).read()
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        with patch('sm.engine.search_job.ESExporter.index_ds') as index_ds_mock:
            index_ds_mock.side_effect = throw_exception_function

            img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url'])
            job = SearchJob(img_store=img_store)
            ds = Dataset.load(db, ds_id)
            job.run(ds)
    except ESExportFailedError as e:
        assert e
        # dataset table asserts
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
    else:
        raise AssertionError('ESExportFailedError should be raised')
    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
def update_optical_images(ds_id_str, sql_where):
    db = DB()

    if ds_id_str:
        ds_ids = ds_id_str.split(',')
    else:
        ds_ids = [
            id for (id,) in db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]

    for i, ds_id in enumerate(ds_ids):
        try:
            transform, img_id = db.select_one(
                'SELECT transform, optical_image from dataset WHERE id = %s', params=(ds_id,)
            )
            if img_id and transform:
                logger.info(f'[{i + 1}/{len(ds_ids)}] Updating optical image of dataset {ds_id}')
                add_optical_image(db, ds_id, img_id, transform)
            else:
                logger.info(f'[{i + 1}/{len(ds_ids)}] Skipping dataset {ds_id}')
        except Exception:
            logger.error(f'Failed to update optical image on {ds_id}', exc_info=True)
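# Hedged usage sketch (not from the original file): the migration scripts in
# this codebase are typically driven by argparse, so update_optical_images
# would plausibly be invoked like this; the flag names are assumptions made
# for illustration only.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Update optical images for datasets')
    parser.add_argument('--ds-id', default='', help='Comma-separated list of dataset ids')
    parser.add_argument('--sql-where', default=None,
                        help="SQL WHERE clause over the dataset table, e.g. \"status = 'FINISHED'\"")
    args = parser.parse_args()

    update_optical_images(args.ds_id, args.sql_where)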
def migrate_optical_images(ds_id):
    output.print('Migrating optical images')
    with timeit():
        output.print('Transferring images and updating database...')
        db = DB()
        rows = db.select(SEL_OPTICAL_IMGS, params=(ds_id,))
        for opt_image_id, opt_image_url in rows:
            if not opt_image_url and opt_image_id:
                transfer_images(
                    ds_id,
                    'optical_images',
                    image_storage.OPTICAL,
                    [opt_image_id],
                )
                opt_image_url = image_storage.get_image_url(
                    image_storage.OPTICAL, ds_id, opt_image_id
                )
                db.alter(UPD_OPTICAL_IMGS, params=(opt_image_url, opt_image_id))

        opt_thumb_id, opt_thumb_url = db.select_one(SEL_OPT_THUMB, params=(ds_id,))
        if not opt_thumb_url and opt_thumb_id:
            transfer_images(
                ds_id,
                'optical_images',
                image_storage.OPTICAL,
                [opt_thumb_id],
            )
            opt_thumb_url = image_storage.get_image_url(image_storage.OPTICAL, ds_id, opt_thumb_id)
            db.alter(UPD_OPT_THUMB, params=(opt_thumb_url, ds_id))
def test_delete_ds(self, EsMock, fill_db):
    db = DB()
    manager = create_daemon_man(db=db, es=EsMock())

    ds_id = '2000-01-01'
    ds = create_ds(ds_id=ds_id)

    manager.delete(ds)

    EsMock.return_value.delete_ds.assert_has_calls([call(ds_id)])
    assert db.select_one('SELECT * FROM dataset WHERE id = %s', params=(ds_id,)) == []
def test_create_moldb_malformed_csv(file, fill_db):
    input_doc = moldb_input_doc(file_path=f's3://{BUCKET_NAME}/{file.value}')
    with patch_bottle_request(input_doc):
        resp = api.databases.create()

        assert resp['status'] == MALFORMED_CSV['status']
        assert resp['error']

    db = DB()
    (db_count,) = db.select_one(MOLDB_COUNT_SEL)
    assert db_count == 0
def test_delete_moldb(fill_db):
    input_doc = moldb_input_doc(file_path=f's3://{BUCKET_NAME}/{MoldbFiles.VALID.value}')
    moldb = create_test_molecular_db(**input_doc)

    with patch_bottle_request(req_doc={}):
        resp = api.databases.delete(moldb_id=moldb.id)

        assert resp['status'] == 'success'

    db = DB()
    (db_count,) = db.select_one(MOLDB_COUNT_SEL)
    assert db_count == 0
def test_create_moldb_empty_values(fill_db):
    input_doc = moldb_input_doc(file_path=f's3://{BUCKET_NAME}/{MoldbFiles.EMPTY_VALUES.value}')
    with patch_bottle_request(input_doc):
        resp = api.databases.create()

        assert resp['status'] == BAD_DATA['status']
        assert resp['error'] and resp['details']

    db = DB()
    (db_count,) = db.select_one(MOLDB_COUNT_SEL)
    assert db_count == 0
def save_additional_info_to_db(db_id, user_id, input_path):
    conf = SMConfig.get_conf()
    with ConnectionPool(conf['db']):
        db = DB()
        if db.select_one('SELECT * FROM molecular_db WHERE id = %s', (db_id,)):
            print(f'Updating existing molecular database {db_id}')
            DB().alter(
                'UPDATE molecular_db SET user_id = %s, input_path = %s WHERE id = %s',
                (user_id, input_path, db_id),
            )
        else:
            print(f'Specified molecular database {db_id} does not exist.')
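# Minimal usage sketch for save_additional_info_to_db; all values below are
# illustrative placeholders, not taken from the original file.
if __name__ == '__main__':
    save_additional_info_to_db(
        db_id=42,                                             # id of an existing molecular_db row
        user_id='00000000-0000-0000-0000-000000000000',       # owning user's UUID
        input_path='s3://sm-engine-upload/moldbs/custom.tsv'  # where the source TSV lives
    )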
def test_sm_daemons_annot_fails(
    MSMSearchMock,
    post_images_to_image_store_mock,
    test_db,
    es_dsl_search,
    clean_isotope_storage,
    reset_queues,
    metadata,
    ds_config,
    queue_pub,
    local_sm_config,
):
    moldb = init_moldb()

    def throw_exception_function(*args, **kwargs):
        raise Exception('Test exception')

    msm_algo_mock = MSMSearchMock()
    msm_algo_mock.search.side_effect = throw_exception_function

    image_ids = ['iso_image_1', None, None, None]
    post_images_to_image_store_mock.return_value = {0: image_ids, 1: image_ids, 2: image_ids}

    db = DB()
    es = ESExporter(db, local_sm_config)
    ds = create_test_ds(
        name=test_ds_name,
        input_path=input_dir_path,
        config={**ds_config, 'database_ids': [moldb.id]},
        status=DatasetStatus.QUEUED,
        es=es,
    )

    queue_pub.publish({'ds_id': ds.id, 'ds_name': test_ds_name, 'action': DaemonAction.ANNOTATE})

    run_daemons(db, es, local_sm_config)

    # dataset and job tables asserts
    row = db.select_one('SELECT status from dataset')
    assert len(row) == 1
    assert row[0] == 'FAILED'
def test_delete_ds(self, fill_db, sm_config, ds_config):
    db = DB(sm_config['db'])
    es_mock = MagicMock(spec=ESExporter)
    img_store_service_mock = MagicMock(spec=ImageStoreServiceWrapper)
    manager = create_daemon_man(sm_config, db=db, es=es_mock, img_store=img_store_service_mock)

    ds_id = '2000-01-01'
    ds = create_ds(ds_id=ds_id, ds_config=ds_config)

    manager.delete(ds)

    ids = ['iso_image_{}_id'.format(id) for id in range(1, 3)]
    img_store_service_mock.delete_image_by_id.assert_has_calls(
        [call('fs', 'iso_image', ids[0]), call('fs', 'iso_image', ids[1])]
    )
    es_mock.delete_ds.assert_called_with(ds_id)
    assert db.select_one('SELECT * FROM dataset WHERE id = %s', params=(ds_id,)) == []
def migrate_ion_thumbnail(ds_id):
    output.print('Migrating ion thumbnail images')
    with timeit():
        output.print('Transferring images and updating database...')
        db = DB()
        ion_thumb_id, ion_thumbnail_url = db.select_one(SEL_ION_THUMB, params=(ds_id,))
        if not ion_thumbnail_url and ion_thumb_id:
            transfer_images(
                ds_id,
                'ion_thumbnails',
                image_storage.THUMB,
                [ion_thumb_id],
            )
            ion_thumb_url = image_storage.get_image_url(image_storage.THUMB, ds_id, ion_thumb_id)
            db.alter(UPD_ION_THUMB, params=(ion_thumb_url, ds_id))
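# Hedged sketch combining the two migration helpers defined above; it assumes
# the caller has already set up the DB connection pool and holds a valid
# dataset id. The wrapper name is an assumption for illustration.
def migrate_dataset_images(ds_id):
    # Move optical and ion-thumbnail images to the new image storage and
    # record their public URLs in the dataset table.
    migrate_optical_images(ds_id)
    migrate_ion_thumbnail(ds_id)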
def test_create_moldb_wrong_formulas(fill_db):
    input_doc = moldb_input_doc(file_path=f's3://{BUCKET_NAME}/{MoldbFiles.WRONG_FORMULAS.value}')
    with patch_bottle_request(input_doc):
        resp = api.databases.create()

        assert resp['status'] == BAD_DATA['status']
        assert resp['error'], resp['details']
        for err_row in resp['details']:
            assert all([err_row.get(err_field, None) for err_field in ['line', 'row', 'error']])

    db = DB()
    (db_count,) = db.select_one(MOLDB_COUNT_SEL)
    assert db_count == 0
def test_creates_ion_thumbnail(test_db, algorithm, metadata, ds_config):
    db = DB()
    ds = _make_fake_ds(db, metadata, ds_config)

    with patch('sm.engine.postprocessing.ion_thumbnail.image_storage') as image_storage_mock:
        image_storage_mock.post_image.return_value = IMG_ID
        image_storage_mock.get_image_url.return_value = IMG_URL
        image_storage_mock.get_ion_images_for_analysis.side_effect = _mock_get_ion_images_for_analysis

        generate_ion_thumbnail(db, ds, algorithm=algorithm)

        ion_thumbnail, ion_thumbnail_url = db.select_one(
            "SELECT ion_thumbnail, ion_thumbnail_url FROM dataset WHERE id = %s", [ds.id]
        )
        assert ion_thumbnail == IMG_ID
        assert ion_thumbnail_url == IMG_URL
        assert image_storage_mock.post_image.called
def test_annotate_ds(self, test_db, sm_config, ds_config):
    es_mock = MagicMock(spec=ESExporter)
    db = DB(sm_config['db'])
    try:
        manager = create_daemon_man(sm_config, db=db, es=es_mock)

        ds_id = '2000-01-01'
        ds_name = 'ds_name'
        input_path = 'input_path'
        upload_dt = datetime.now()
        metadata = {}
        ds = create_ds(ds_id=ds_id, ds_name=ds_name, input_path=input_path,
                       upload_dt=upload_dt, metadata=metadata, ds_config=ds_config)

        manager.annotate(ds, search_job_factory=self.SearchJob)

        DS_SEL = 'select name, input_path, upload_dt, metadata, config from dataset where id=%s'
        assert db.select_one(DS_SEL, params=(ds_id,)) == (
            ds_name, input_path, upload_dt, metadata, ds_config
        )
    finally:
        db.close()
def run(ds_id, sql_where):
    conf = SMConfig.get_conf()
    db = DB(conf['db'])
    img_store = ImageStoreServiceWrapper(conf['services']['img_service_url'])

    if sql_where:
        ds_ids = [
            id for (id,) in db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]
    else:
        ds_ids = ds_id.split(',')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(f'[{i+1} / {len(ds_ids)}] Updating acq geometry for {ds_id}')
            ds = Dataset.load(db, ds_id)
            (sample_img_id,) = db.select_one(
                "SELECT iim.iso_image_ids[1] from job j "
                "JOIN iso_image_metrics iim on j.id = iim.job_id "
                "WHERE j.ds_id = %s LIMIT 1",
                [ds_id],
            )
            print(sample_img_id)
            if sample_img_id:
                # PIL's Image.size is (width, height), so dims holds (n_rows, n_cols)
                w, h = img_store.get_image_by_id('fs', 'iso_image', sample_img_id).size
                dims = (h, w)
            else:
                dims = (None, None)
            acq_geometry = make_acq_geometry('ims', None, ds.metadata, dims)
            ds.save_acq_geometry(db, acq_geometry)
        except Exception:
            logger.error(f'Failed on {ds_id}', exc_info=True)
def save_scoring_model_to_db(name, type_, params):
    """Adds/updates the scoring_model in the local database"""
    # Import DB locally so that Lithops doesn't try to pickle it & fail due to psycopg2
    # pylint: disable=import-outside-toplevel  # circular import
    from sm.engine.db import DB

    if not isinstance(params, str):
        params = json.dumps(params)

    db = DB()
    if db.select_one('SELECT * FROM scoring_model WHERE name = %s', (name,)):
        logger.info(f'Updating existing scoring model {name}')
        DB().alter(
            'UPDATE scoring_model SET type = %s, params = %s WHERE name = %s',
            (type_, params, name),
        )
    else:
        logger.info(f'Inserting new scoring model {name}')
        DB().alter(
            'INSERT INTO scoring_model(name, type, params) VALUES (%s, %s, %s)',
            (name, type_, params),
        )
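# Minimal usage sketch for save_scoring_model_to_db, mirroring the call made in
# ensure_db_populated below; the params dict is an illustrative placeholder.
if __name__ == '__main__':
    save_scoring_model_to_db(
        name='v3_default',
        type_='catboost',
        # Non-string params are serialized to JSON by the function itself
        params={'s3_path': 's3://sm-engine/scoring_models/v3_default.cbm'},
    )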
def test_annotate_ds(self, AnnotationJobMock, fill_db, metadata, ds_config):
    es_mock = MagicMock(spec=ESExporter)
    db = DB()
    manager = create_daemon_man(db=db, es=es_mock)

    ds_id = '2000-01-01'
    ds_name = 'ds_name'
    input_path = 'input_path'
    upload_dt = datetime.now()
    ds = create_ds(
        ds_id=ds_id,
        ds_name=ds_name,
        input_path=input_path,
        upload_dt=upload_dt,
        metadata=metadata,
    )

    manager.annotate(ds)

    DS_SEL = 'select name, input_path, upload_dt, metadata, config from dataset where id=%s'
    results = db.select_one(DS_SEL, params=(ds_id,))
    assert results[3] == metadata
    assert results[4] == ds_config
def ensure_db_populated(sm_config, analysis_version, database):
    db = DB()

    # Install DB schema if needed
    query = "SELECT COUNT(*) FROM pg_tables WHERE schemaname = 'public' AND tablename = 'dataset'"
    tables_exist = db.select_one(query)[0] >= 1
    if not tables_exist:
        print('Installing DB schema')
        db.alter(DB_SQL_SCHEMA)

    # Import HMDB if needed
    moldb = MOL_DBS[database]
    try:
        molecular_db.find_by_name_version(moldb['name'], moldb['version'])
    except SMError:
        print(f'Importing {database}')
        with TemporaryDirectory() as tmp:
            urlretrieve(moldb['url'], f'{tmp}/moldb.tsv')
            molecular_db.create(moldb['name'], moldb['version'], f'{tmp}/moldb.tsv')

    if analysis_version > 1:
        if len(db.select("SELECT name FROM scoring_model WHERE name = 'v3_default'")) == 0:
            print("Importing v3_default scoring model")
            params = upload_catboost_scoring_model(
                model=Path(proj_root())
                / '../scoring-models/v3_default/model-2022-01-05T13-45-26.947188-416b1311.cbm',
                bucket=sm_config['lithops']['lithops']['storage_bucket'],
                prefix='test_scoring_models/v3_default',
                is_public=False,
            )
            save_scoring_model_to_db(name='v3_default', type_='catboost', params=params)
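# Hedged usage sketch for ensure_db_populated; it assumes MOL_DBS contains an
# 'hmdb' entry (the key is a guess for illustration) and reuses the
# SMConfig/ConnectionPool pattern seen elsewhere in this codebase.
if __name__ == '__main__':
    sm_config = SMConfig.get_conf()
    with ConnectionPool(sm_config['db']):
        # analysis_version > 1 also triggers the scoring model import
        ensure_db_populated(sm_config, analysis_version=3, database='hmdb')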
parser = argparse.ArgumentParser(description='Exporting isotopic images')
parser.add_argument('ds_name', type=str, help='Dataset name')
parser.add_argument('sf', type=str, help='sum formula')
parser.add_argument('pkl_path', type=str, help='Path for the cPickle file')
parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path')
parser.set_defaults(sm_config_path=path.join(proj_root(), 'conf/config.json'))
args = parser.parse_args()

SMConfig.set_path(args.sm_config_path)

db = DB(SMConfig.get_conf()['db'])
ds_config, img_bounds = db.select_one(DS_CONFIG_SEL, args.ds_name)
nrows, ncols = get_img_dims(img_bounds)

isotope_gen_config = ds_config['isotope_generation']
charge = '{}{}'.format(isotope_gen_config['charge']['polarity'],
                       isotope_gen_config['charge']['n_charges'])
export_rs = db.select(EXPORT_SEL, args.ds_name, args.sf)

export_df = pd.DataFrame(export_rs, columns=['sf', 'adduct', 'peak', 'pxl_inds', 'ints'])
export_df['img_dims'] = [(img_bounds['y']['min'], img_bounds['y']['max'],
                          img_bounds['x']['min'], img_bounds['x']['max'])] * len(export_df)

# export_df['img'] = export_df.apply(lambda r: build_matrix(np.array(r['pxl_inds']),
#                                                           np.array(r['ints']), nrows, ncols), axis=1)
# export_df.drop(['pxl_inds', 'ints'], axis=1, inplace=True)
# export_df.to_csv(args.csv_path, index=False)
def test_sm_daemon_es_export_fails(
    MSMSearchMock,
    post_images_to_image_store_mock,
    test_db,
    es_dsl_search,
    clean_isotope_storage,
    reset_queues,
    metadata,
    ds_config,
    queue_pub,
    local_sm_config,
):
    moldb = init_moldb()

    formula_metrics_df = pd.DataFrame(
        {
            'formula_i': [0, 1, 2],
            'ion_formula': ['C12H24O-H2O+H', 'C12H24O-H2+O2-CO+Na', 'C12H24O+K'],
            'formula': ['C12H24O', 'C12H24O', 'C12H24O'],
            'modifier': ['-H2O+H', '-H2+O2-CO+Na', '+K'],
            'chem_mod': ['', '-H2+O2', ''],
            'neutral_loss': ['-H2O', '-CO', ''],
            'adduct': ['+H', '+Na', '+K'],
            'chaos': [0.9, 0.9, 0.9],
            'spatial': [0.9, 0.9, 0.9],
            'spectral': [0.9, 0.9, 0.9],
            'msm': [0.9 ** 3, 0.9 ** 3, 0.9 ** 3],
            'total_iso_ints': [[100.0], [100.0], [100.0]],
            'min_iso_ints': [[0], [0], [0]],
            'max_iso_ints': [[10.0], [10.0], [10.0]],
            'fdr': [0.1, 0.1, 0.1],
        }
    ).set_index('formula_i')
    search_algo_mock = MSMSearchMock()
    search_algo_mock.search.return_value = [
        (formula_metrics_df, [], create_test_fdr_diagnostics_bundle())
    ]
    search_algo_mock.metrics = OrderedDict(
        [
            ('chaos', 0),
            ('spatial', 0),
            ('spectral', 0),
            ('msm', 0),
            ('total_iso_ints', []),
            ('min_iso_ints', []),
            ('max_iso_ints', []),
        ]
    )
    image_ids = ['iso_image_1', None, None, None]
    post_images_to_image_store_mock.return_value = {0: image_ids, 1: image_ids, 2: image_ids}

    db = DB()

    def throw_exception_function(*args, **kwargs):
        raise Exception('Test')

    es = ESExporter(db, local_sm_config)
    es.index_ds = throw_exception_function

    ds = create_test_ds(
        name=test_ds_name,
        input_path=input_dir_path,
        config={**ds_config, 'database_ids': [moldb.id]},
        status=DatasetStatus.QUEUED,
        es=es,
    )

    queue_pub.publish({'ds_id': ds.id, 'ds_name': test_ds_name, 'action': DaemonAction.ANNOTATE})

    run_daemons(db, es, local_sm_config)

    # dataset and job tables asserts
    row = db.select_one('SELECT status from job')
    assert row[0] == 'FINISHED'
    row = db.select_one('SELECT status from dataset')
    assert row[0] == 'FAILED'
"JOIN agg_formula f ON f.id = m.sf_id AND sf_db.id = f.db_id " "JOIN job j ON j.id = m.job_id " "JOIN dataset ds ON ds.id = j.ds_id " "JOIN theor_peaks tp ON tp.db_id = sf_db.id AND tp.sf_id = m.sf_id AND tp.adduct = m.adduct " "WHERE sf_db.name = %s AND ds.name = %s " "AND ROUND(sigma::numeric, 6) = %s AND charge = %s AND pts_per_mz = %s") if __name__ == "__main__": parser = argparse.ArgumentParser(description='Exporting search results into a csv file') parser.add_argument('ds_name', type=str, help='Dataset name') parser.add_argument('csv_path', type=str, help='Path for the csv file') parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path') parser.set_defaults(sm_config_path=path.join(proj_root(), 'conf/config.json')) args = parser.parse_args() SMConfig.set_path(args.sm_config_path) db = DB(SMConfig.get_conf()['db']) ds_config = db.select_one(DS_CONFIG_SEL, args.ds_name)[0] isotope_gen_config = ds_config['isotope_generation'] charge = '{}{}'.format(isotope_gen_config['charge']['polarity'], isotope_gen_config['charge']['n_charges']) export_rs = db.select(EXPORT_SEL, ds_config['database']['name'], args.ds_name, isotope_gen_config['isocalc_sigma'], charge, isotope_gen_config['isocalc_pts_per_mz']) header = ','.join(['formula_db', 'ds_name', 'sf', 'adduct', 'chaos', 'img_corr', 'pat_match', 'isocalc_sigma', 'isocalc_charge', 'isocalc_pts_per_mz', 'first_peak_mz']) + '\n' with open(args.csv_path, 'w') as f: f.write(header) f.writelines([','.join(map(str, row)) + '\n' for row in export_rs]) logger.info('Exported all search results for "%s" dataset into "%s" file', args.ds_name, args.csv_path)
class TheorPeaksGenerator(object):
    """ Generator of theoretical isotope peaks for all molecules in a database.

    Args
    ----------
    sc : pyspark.SparkContext
    sm_config : dict
        SM engine config
    ds_config : dict
        Dataset config
    """
    def __init__(self, sc, sm_config, ds_config):
        self.sc = sc
        self.sm_config = sm_config
        self.ds_config = ds_config

        self.theor_peaks_tmp_dir = join(sm_config['fs']['base_path'], 'tmp_theor_peaks_gen')
        self.db = DB(sm_config['db'])

        self.adducts = self.ds_config['isotope_generation']['adducts']

        self.isocalc_wrapper = IsocalcWrapper(self.ds_config['isotope_generation'])

    @staticmethod
    def _sf_elements(sf):
        return [seg.element().name() for seg in parseSumFormula(sf).get_segments()]

    @classmethod
    def _valid_sf_adduct(cls, sf, adduct):
        if sf is None or adduct is None or sf == 'None' or adduct == 'None':
            logger.warning('Invalid sum formula or adduct: sf=%s, adduct=%s', sf, adduct)
            return False

        if '-' in adduct and adduct.strip('-') not in cls._sf_elements(sf):
            logger.info('No negative adduct element in the sum formula: sf=%s, adduct=%s', sf, adduct)
            return False

        return True

    def run(self):
        """ Starts peaks generation. Checks all formula peaks saved in the database
        and generates peaks only for new ones"""
        logger.info('Running theoretical peaks generation')

        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
        formula_list = self.apply_database_filters(self.db.select(AGG_FORMULA_SEL, db_id))
        stored_sf_adduct = self.db.select(SF_ADDUCT_SEL, db_id,
                                          self.isocalc_wrapper.sigma,
                                          self.isocalc_wrapper.charge,
                                          self.isocalc_wrapper.pts_per_mz)

        sf_adduct_cand = self.find_sf_adduct_cand(formula_list, set(stored_sf_adduct))
        logger.info('%d saved (sf, adduct)s, %s not saved (sf, adduct)s',
                    len(stored_sf_adduct), len(sf_adduct_cand))

        if sf_adduct_cand:
            self.generate_theor_peaks(sf_adduct_cand)

    def apply_database_filters(self, formula_list):
        """ Filters according to settings in dataset config

        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through

        Returns
        -------
        : list
            Filtered list of pairs (id, sum formula)
        """
        if 'organic' in map(lambda s: s.lower(), self.ds_config['database'].get('filters', [])):
            logger.info('Organic sum formula filter has been applied')
            return filter(lambda (_, sf): 'C' in self._sf_elements(sf), formula_list)
        return formula_list

    def find_sf_adduct_cand(self, formula_list, stored_sf_adduct):
        """
        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through
        stored_sf_adduct : set
            Set of (formula, adduct) pairs which have theoretical patterns saved in the database

        Returns
        -------
        : list
            List of (formula id, formula, adduct) triples which don't have
            theoretical patterns saved in the database
        """
        assert formula_list, 'Empty agg_formula table!'

        adducts = set(self.adducts) | set(DECOY_ADDUCTS)
        cand = [(id, sf, a) for (id, sf) in formula_list for a in adducts]
        return filter(lambda (sf_id, sf, adduct): (sf, adduct) not in stored_sf_adduct, cand)

    def generate_theor_peaks(self, sf_adduct_cand):
        """
        Args
        ----
        sf_adduct_cand : list
            List of (formula id, formula, adduct) triples which don't have
            theoretical patterns saved in the database

        Returns
        -------
        : list
            List of strings with formatted theoretical peaks data
        """
        logger.info('Generating missing peaks')
        formatted_iso_peaks = self.isocalc_wrapper.formatted_iso_peaks
        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
        n = 10000
        for i in xrange(0, len(sf_adduct_cand), n):
            sf_adduct_cand_rdd = self.sc.parallelize(sf_adduct_cand[i:i + n], numSlices=128)
            peak_lines = (sf_adduct_cand_rdd
                          .flatMap(lambda (sf_id, sf, adduct): formatted_iso_peaks(db_id, sf_id, sf, adduct))
                          .collect())
            self._import_theor_peaks_to_db(peak_lines)

    def _import_theor_peaks_to_db(self, peak_lines):
        logger.info('Saving new peaks to the DB')
        if not exists(self.theor_peaks_tmp_dir):
            makedirs(self.theor_peaks_tmp_dir)

        peak_lines_path = join(self.theor_peaks_tmp_dir, 'peak_lines.csv')
        with open(peak_lines_path, 'w') as f:
            f.write('\n'.join(peak_lines))

        with open(peak_lines_path) as peaks_file:
            self.db.copy(peaks_file, 'theor_peaks')
class TheorPeaksGenerator(object):
    """ Generator of theoretical isotope peaks for all molecules in a database.

    Args
    ----------
    sc : pyspark.SparkContext
    sm_config : dict
        SM engine config
    ds_config : dict
        Dataset config
    """
    def __init__(self, sc, sm_config, ds_config):
        # TODO: replace sm_config with db
        self.sc = sc
        self.sm_config = sm_config
        self.ds_config = ds_config

        self.theor_peaks_tmp_dir = join(sm_config['fs']['base_path'], 'tmp_theor_peaks_gen')
        self.db = DB(sm_config['db'])

        self.adducts = self.ds_config['isotope_generation']['adducts']

        self.isocalc_wrapper = IsocalcWrapper(self.ds_config['isotope_generation'])

    @staticmethod
    def _sf_elements(sf):
        return [seg.element().name() for seg in parseSumFormula(sf).get_segments()]

    @classmethod
    def _valid_sf_adduct(cls, sf, adduct):
        if sf is None or adduct is None or sf == 'None' or adduct == 'None':
            logger.warning('Invalid sum formula or adduct: sf=%s, adduct=%s', sf, adduct)
            return False

        if '-' in adduct and adduct.strip('-') not in cls._sf_elements(sf):
            logger.info('No negative adduct element in the sum formula: sf=%s, adduct=%s', sf, adduct)
            return False

        return True

    def run(self):
        """ Starts peaks generation. Checks all formula peaks saved in the database
        and generates peaks only for new ones"""
        logger.info('Running theoretical peaks generation')

        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
        formula_list = self.apply_database_filters(self.db.select(AGG_FORMULA_SEL, db_id))
        stored_sf_adduct = self.db.select(SF_ADDUCT_SEL, db_id,
                                          self.isocalc_wrapper.sigma,
                                          self.isocalc_wrapper.charge,
                                          self.isocalc_wrapper.pts_per_mz)

        sf_adduct_cand = self.find_sf_adduct_cand(formula_list, set(stored_sf_adduct))
        logger.info('%d saved (sf, adduct)s, %s not saved (sf, adduct)s',
                    len(stored_sf_adduct), len(sf_adduct_cand))

        if sf_adduct_cand:
            self.generate_theor_peaks(sf_adduct_cand)

    def apply_database_filters(self, formula_list):
        """ Filters according to settings in dataset config

        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through

        Returns
        -------
        : list
            Filtered list of pairs (id, sum formula)
        """
        if 'organic' in map(lambda s: s.lower(), self.ds_config['database'].get('filters', [])):
            logger.info('Organic sum formula filter has been applied')
            return filter(lambda (_, sf): 'C' in self._sf_elements(sf), formula_list)
        return formula_list

    def find_sf_adduct_cand(self, formula_list, stored_sf_adduct):
        """
        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through
        stored_sf_adduct : set
            Set of (formula, adduct) pairs which have theoretical patterns saved in the database

        Returns
        -------
        : list
            List of (formula id, formula, adduct) triples which don't have
            theoretical patterns saved in the database
        """
        assert formula_list, 'Empty agg_formula table!'

        adducts = set(self.adducts) | set(DECOY_ADDUCTS)
        cand = [(id, sf, a) for (id, sf) in formula_list for a in adducts]
        return filter(lambda (sf_id, sf, adduct): (sf, adduct) not in stored_sf_adduct, cand)

    def generate_theor_peaks(self, sf_adduct_cand):
        """
        Args
        ----
        sf_adduct_cand : list
            List of (formula id, formula, adduct) triples which don't have
            theoretical patterns saved in the database

        Returns
        -------
        : list
            List of strings with formatted theoretical peaks data
        """
        logger.info('Generating missing peaks')
        formatted_iso_peaks = self.isocalc_wrapper.formatted_iso_peaks
        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
        n = 10000
        for i in xrange(0, len(sf_adduct_cand), n):
            sf_adduct_cand_rdd = self.sc.parallelize(sf_adduct_cand[i:i + n], numSlices=128)
            peak_lines = (sf_adduct_cand_rdd
                          .flatMap(lambda (sf_id, sf, adduct): formatted_iso_peaks(db_id, sf_id, sf, adduct))
                          .collect())
            self._import_theor_peaks_to_db(peak_lines)

    def _import_theor_peaks_to_db(self, peak_lines):
        logger.info('Saving new peaks to the DB')
        if not exists(self.theor_peaks_tmp_dir):
            makedirs(self.theor_peaks_tmp_dir)

        peak_lines_path = join(self.theor_peaks_tmp_dir, 'peak_lines.csv')
        with open(peak_lines_path, 'w') as f:
            f.write('\n'.join(peak_lines))

        with open(peak_lines_path) as peaks_file:
            self.db.copy(peaks_file, 'theor_peaks')
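# Usage sketch for TheorPeaksGenerator, mirroring the call sequence inside
# SearchJob.run() below; it assumes a live SparkContext plus valid SM and
# dataset configs are already available in the caller. The wrapper name is
# an assumption for illustration.
def generate_missing_peaks(sc, sm_config, ds_config):
    # Generates and stores theoretical isotope patterns for any (formula, adduct)
    # pairs of the dataset's molecular database not yet present in theor_peaks.
    theor_peaks_gen = TheorPeaksGenerator(sc, sm_config, ds_config)
    theor_peaks_gen.run()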
if __name__ == "__main__": parser = argparse.ArgumentParser( description='Exporting search results into a csv file') parser.add_argument('ds_name', type=str, help='Dataset name') parser.add_argument('csv_path', type=str, help='Path for the csv file') parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path') parser.set_defaults( sm_config_path=path.join(proj_root(), 'conf/config.json')) args = parser.parse_args() SMConfig.set_path(args.sm_config_path) db = DB(SMConfig.get_conf()['db']) ds_config = db.select_one(DS_CONFIG_SEL, args.ds_name)[0] isotope_gen_config = ds_config['isotope_generation'] charge = '{}{}'.format(isotope_gen_config['charge']['polarity'], isotope_gen_config['charge']['n_charges']) export_rs = db.select(EXPORT_SEL, ds_config['database']['name'], args.ds_name, isotope_gen_config['isocalc_sigma'], charge, isotope_gen_config['isocalc_pts_per_mz']) header = '\t'.join(['formula_db', 'db_ids', 'sf_name', 'sf', 'adduct']) +'\t' + '\t'.join(metrics) + '\t' + \ '\t'.join(['fdr', 'isocalc_sigma', 'isocalc_charge', 'isocalc_pts_per_mz', 'first_peak_mz']) + '\n' with open(args.csv_path, 'w') as f: f.write(header) f.writelines(['\t'.join(map(str, row)) + '\n' for row in export_rs]) logger.info('Exported all search results for "%s" dataset into "%s" file', args.ds_name, args.csv_path)
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----------
    client_email : string
        Email of the client who submitted the dataset
    ds_name : string
        A dataset short name
    """
    def __init__(self, client_email, ds_name):
        self.sm_config = SMConfig.get_conf()
        self.client_email = client_email
        self.ds_name = ds_name
        self.ds_id = None
        self.job_id = None
        self.sc = None
        self.db = None
        self.ds = None
        self.fdr = None
        self.formulas = None
        self.ds_config = None
        self.wd_manager = None

    def _read_ds_config(self):
        with open(self.wd_manager.ds_config_path) as f:
            self.ds_config = json.load(f)

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self.sm_config['spark'].iteritems():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self.sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key", self.sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key", self.sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

        # sconf.set("spark.python.profile", "true")
        self.sc = SparkContext(master=self.sm_config['spark']['master'], conf=sconf,
                               appName='SM engine')
        if not self.sm_config['spark']['master'].startswith('local'):
            self.sc.addPyFile(join(local_path(proj_root()), 'sm.zip'))

    def _init_db(self):
        logger.info('Connecting to the DB')
        self.db = DB(self.sm_config['db'])
        self.sf_db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]

    def store_job_meta(self):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        self.ds_id = int(self.db.select_one(DS_ID_SEL, self.ds_name)[0])
        self.job_id = self.ds_id
        self.db.alter(DEL_JOB_SQL, self.job_id)
        rows = [(self.job_id, self.sf_db_id, self.ds_id,
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self.db.insert(JOB_INS, rows)

        rows = [(self.job_id, adduct)
                for adduct in self.ds_config['isotope_generation']['adducts']]
        self.db.insert(ADDUCT_INS, rows)

    def run(self, input_path, ds_config_path, clean=False):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Conversion of input data (imzML+ibd) to plain text format.
              One line - one spectrum data
            * Generation and saving to the database of theoretical peaks
              for all formulas from the molecule database
            * Molecule search. The most compute intensive part.
              Spark is used to run it in a distributed manner.
            * Saving results (isotope images and their metrics of quality
              for each putative molecule) to the database

        Args
        -------
        input_path : string
            Path to the dataset folder with .imzML and .ibd files
        ds_config_path: string
            Path to the dataset config file
        clean : bool
            Clean all interim data files before starting molecule search
        """
        try:
            self.wd_manager = WorkDirManager(self.ds_name)
            if clean:
                self.wd_manager.clean()

            self.wd_manager.copy_input_data(input_path, ds_config_path)

            self._read_ds_config()
            logger.info('Dataset config:\n%s', pformat(self.ds_config))

            self._configure_spark()
            self._init_db()

            if not self.wd_manager.exists(self.wd_manager.txt_path):
                imzml_converter = ImzmlTxtConverter(self.ds_name,
                                                    self.wd_manager.local_dir.imzml_path,
                                                    self.wd_manager.local_dir.txt_path,
                                                    self.wd_manager.local_dir.coord_path)
                imzml_converter.convert()

                if not self.wd_manager.local_fs_only:
                    self.wd_manager.upload_to_remote()

            self.ds = Dataset(self.sc, self.ds_name, self.client_email, input_path,
                              self.ds_config, self.wd_manager, self.db)
            self.ds.save_ds_meta()

            self.store_job_meta()

            theor_peaks_gen = TheorPeaksGenerator(self.sc, self.sm_config, self.ds_config)
            theor_peaks_gen.run()

            target_adducts = self.ds_config['isotope_generation']['adducts']
            self.fdr = FDR(self.job_id, self.sf_db_id, decoy_sample_size=20,
                           target_adducts=target_adducts, db=self.db)
            self.fdr.decoy_adduct_selection()
            self.formulas = FormulasSegm(self.job_id, self.sf_db_id, self.ds_config, self.db)

            # search_alg = MSMBasicSearch(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
            search_alg = MSMExtraFeats(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
            sf_metrics_df, sf_iso_images = search_alg.search()

            search_results = SearchResults(self.sf_db_id, self.ds_id, self.job_id,
                                           self.ds_name, self.formulas.get_sf_adduct_peaksn(),
                                           self.db, self.sm_config, self.ds_config)
            search_results.sf_metrics_df = sf_metrics_df
            search_results.sf_iso_images = sf_iso_images
            search_results.metrics = search_alg.metrics
            search_results.nrows, search_results.ncols = self.ds.get_dims()
            search_results.store()

            es = ESExporter(self.sm_config)
            es.index_ds(self.db, self.ds_name, self.ds_config['database']['name'])
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            logger.error('\n'.join(traceback.format_exception(exc_type, exc_value, exc_traceback)))
        finally:
            if self.sc:
                # self.sc.show_profiles()
                self.sc.stop()
            if self.db:
                self.db.close()
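# Hedged usage sketch for SearchJob; the email and paths are placeholders.
# run() performs the whole pipeline documented in its docstring: copy input,
# convert imzML, generate theoretical peaks, run the Spark search, then store
# and index the results.
if __name__ == '__main__':
    job = SearchJob(client_email='user@example.com', ds_name='example_ds')
    job.run(input_path='/data/example_ds',
            ds_config_path='/data/example_ds/config.json',
            clean=True)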