def reindex_all_results(conf):
    """Drop and rebuild the 'sm' ES index, then re-index every dataset's results."""
    db = DB(conf['db'])
    exporter = ESExporter(conf)
    # Recreate the index from scratch so stale documents disappear.
    exporter.delete_index(name='sm')
    exporter.create_index(name='sm')
    # Each dataset row carries its molecular database name inside the JSON config.
    pairs = db.select("select name, config -> 'database'::text -> 'name'::text from dataset")
    for ds_name, db_name in pairs:
        exporter.index_ds(db, ds_name, db_name)
def reindex_all_results(conf):
    """Rebuild the 'sm' ES index and re-export every dataset found in the DB."""
    database = DB(conf['db'])
    es_exporter = ESExporter(conf)
    # Delete-then-create guarantees a clean index before re-population.
    es_exporter.delete_index(name='sm')
    es_exporter.create_index(name='sm')
    dataset_rows = database.select(
        "select name, config -> 'database'::text -> 'name'::text from dataset")
    for dataset_name, moldb_name in dataset_rows:
        es_exporter.index_ds(database, dataset_name, moldb_name)
def test_foo(sm_config):
    """index_ds should create one ES annotation document per DB annotation row."""
    annotations = [('test_ds', 'test_db', 'H20', '+H', [], []),
                   ('test_ds', 'test_db', 'Au', '+H', [], [])]
    db_mock = MagicMock(DB)
    db_mock.select.return_value = annotations

    ESExporter(sm_config).index_ds(db_mock, 'test_ds', 'test_db')

    es = Elasticsearch()
    # Each doc id is <ds>_<db>_<sf>_<adduct>; check both expected documents.
    expected_docs = [('test_ds_test_db_H20_+H', 'H20'),
                     ('test_ds_test_db_Au_+H', 'Au')]
    for doc_id, sf in expected_docs:
        doc = es.get(index='sm', id=doc_id, doc_type='annotation', _source=True)
        assert doc['_source'] == {'ds_name': 'test_ds', 'db_name': 'test_db',
                                  'sf': sf, 'adduct': '+H',
                                  'comp_names': '', 'comp_ids': ''}
def test_foo(sm_config):
    """index_ds should store annotations including a zero-padded 'mz' string field."""
    annotations = [('test_ds', 'test_db', 'H20', '+H', [], [], 100),
                   ('test_ds', 'test_db', 'Au', '+H', [], [], 200)]
    db_mock = MagicMock(DB)
    db_mock.select.return_value = annotations

    ESExporter(sm_config).index_ds(db_mock, 'test_ds', 'test_db')

    es = Elasticsearch()
    # (doc id, sum formula, zero-padded mz) triples for both indexed annotations.
    expected_docs = [('test_ds_test_db_H20_+H', 'H20', '00100.0000'),
                     ('test_ds_test_db_Au_+H', 'Au', '00200.0000')]
    for doc_id, sf, mz in expected_docs:
        doc = es.get(index='sm', id=doc_id, doc_type='annotation', _source=True)
        assert doc['_source'] == {'ds_name': 'test_ds', 'db_name': 'test_db',
                                  'sf': sf, 'adduct': '+H',
                                  'comp_names': '', 'comp_ids': '',
                                  'mz': mz}
def test_index_ds_works(es_dsl_search, sm_index, sm_config):
    """index_ds must index the dataset doc plus one annotation doc per formula,
    merging DB rows, molecule metadata and isotope-pattern centroids.
    """
    ds_id = '2000-01-01_00h00m'
    upload_dt = datetime.now().isoformat(' ')
    mol_db_id = 0
    last_finished = '2017-01-01T00:00:00'

    def db_sel_side_effect(sql, params):
        if sql == DATASET_SEL:
            return [{
                'ds_id': ds_id, 'ds_name': 'ds_name', 'ds_input_path': 'ds_input_path',
                'ds_config': 'ds_config', 'ds_meta': {}, 'ds_upload_dt': upload_dt,
                'ds_status': 'ds_status',
                'ds_last_finished': datetime.strptime(last_finished, '%Y-%m-%dT%H:%M:%S'),
                'ds_is_public': True, 'ds_ion_img_storage': 'fs', 'ds_acq_geometry': {}
            }]
        elif sql == ANNOTATIONS_SEL:
            return [{
                'sf': 'H2O', 'sf_adduct': 'H2O+H', 'chaos': 1, 'image_corr': 1,
                'pattern_match': 1, 'total_iso_ints': 100, 'min_iso_ints': 0,
                'max_iso_ints': 100, 'msm': 1, 'adduct': '+H', 'job_id': 1, 'fdr': 0.1,
                'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'], 'polarity': '+'
            }, {
                'sf': 'Au', 'sf_adduct': 'Au+H', 'chaos': 1, 'image_corr': 1,
                'pattern_match': 1, 'total_iso_ints': 100, 'min_iso_ints': 0,
                'max_iso_ints': 100, 'msm': 1, 'adduct': '+H', 'job_id': 1, 'fdr': 0.05,
                'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'], 'polarity': '+'
            }]
        else:
            # FIX: the original referenced an undefined name `args` (NameError) and
            # used a format string with no placeholder; log the actual arguments.
            logging.getLogger('engine').error(
                'Wrong db_sel_side_effect arguments: %s', (sql, params))

    db_mock = MagicMock(spec=DB)
    db_mock.select_with_fields.side_effect = db_sel_side_effect

    mol_db_mock = MagicMock(MolecularDB)
    mol_db_mock.id = mol_db_id
    mol_db_mock.name = 'db_name'
    mol_db_mock.version = '2017'
    mol_db_mock.get_molecules.return_value = pd.DataFrame(
        [('H2O', 'mol_id', 'mol_name'), ('Au', 'mol_id', 'mol_name')],
        columns=['sf', 'mol_id', 'mol_name'])

    isocalc_mock = MagicMock(IsocalcWrapper)
    isocalc_mock.ion_centroids = lambda sf, adduct: {
        ('H2O', '+H'): ([100., 200.], None),
        ('Au', '+H'): ([10., 20.], None)
    }[(sf, adduct)]

    es_exp = ESExporter(db_mock)
    es_exp.delete_ds(ds_id)
    es_exp.index_ds(ds_id=ds_id, mol_db=mol_db_mock, isocalc=isocalc_mock)

    wait_for_es(sec=1)

    ds_d = es_dsl_search.filter(
        'term', _type='dataset').execute().to_dict()['hits']['hits'][0]['_source']
    assert ds_d == {
        'ds_last_finished': last_finished, 'ds_config': 'ds_config', 'ds_meta': {},
        'ds_status': 'ds_status', 'ds_name': 'ds_name', 'ds_input_path': 'ds_input_path',
        'ds_id': ds_id, 'ds_upload_dt': upload_dt,
        'annotation_counts': [{'db': {'name': 'db_name', 'version': '2017'},
                               'counts': [{'level': 5, 'n': 1}, {'level': 10, 'n': 2},
                                          {'level': 20, 'n': 2}, {'level': 50, 'n': 2}]}],
        'ds_is_public': True, 'ds_acq_geometry': {}, 'ds_ion_img_storage': 'fs'
    }

    ann_1_d = es_dsl_search.filter(
        'term', sf='H2O').execute().to_dict()['hits']['hits'][0]['_source']
    assert ann_1_d == {
        'pattern_match': 1, 'image_corr': 1, 'fdr': 0.1, 'chaos': 1, 'sf': 'H2O',
        'min_iso_ints': 0, 'msm': 1, 'sf_adduct': 'H2O+H', 'total_iso_ints': 100,
        'centroid_mzs': [100., 200.],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'], 'polarity': '+',
        'job_id': 1, 'max_iso_ints': 100, 'adduct': '+H', 'ds_name': 'ds_name',
        'annotation_counts': [], 'db_version': '2017', 'ds_status': 'ds_status',
        'ion_add_pol': '[M+H]+', 'comp_names': ['mol_name'], 'db_name': 'db_name',
        'mz': 100., 'ds_meta': {}, 'comp_ids': ['mol_id'], 'ds_config': 'ds_config',
        'ds_input_path': 'ds_input_path', 'ds_id': ds_id, 'ds_upload_dt': upload_dt,
        'ds_last_finished': last_finished, 'ds_ion_img_storage': 'fs',
        'ds_is_public': True
    }

    ann_2_d = es_dsl_search.filter(
        'term', sf='Au').execute().to_dict()['hits']['hits'][0]['_source']
    assert ann_2_d == {
        'pattern_match': 1, 'image_corr': 1, 'fdr': 0.05, 'chaos': 1, 'sf': 'Au',
        'min_iso_ints': 0, 'msm': 1, 'sf_adduct': 'Au+H', 'total_iso_ints': 100,
        'centroid_mzs': [10., 20.],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'], 'polarity': '+',
        'job_id': 1, 'max_iso_ints': 100, 'adduct': '+H', 'ds_name': 'ds_name',
        'annotation_counts': [], 'db_version': '2017', 'ds_status': 'ds_status',
        'ion_add_pol': '[M+H]+', 'comp_names': ['mol_name'], 'db_name': 'db_name',
        'mz': 10., 'ds_meta': {}, 'comp_ids': ['mol_id'], 'ds_config': 'ds_config',
        'ds_input_path': 'ds_input_path', 'ds_id': ds_id, 'ds_upload_dt': upload_dt,
        'ds_last_finished': last_finished, 'ds_ion_img_storage': 'fs',
        'ds_is_public': True
    }
def test_sm_daemon_es_export_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                   post_images_to_annot_service_mock, MolDBServiceWrapperMock,
                                   sm_config, test_db, es_dsl_search, clean_isotope_storage):
    """When ES indexing raises, the job still FINISHES but the dataset ends FAILED."""
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)
    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x
    url_dict = {'iso_image_ids': ['iso_image_1', None, None, None]}
    post_images_to_annot_service_mock.return_value = {35: url_dict, 44: url_dict}

    db = DB(sm_config['db'])
    annotate_daemon = None
    update_daemon = None

    def throw_exception_function(*args, **kwargs):
        raise Exception('Test')

    # Force the export step to blow up so daemon error handling is exercised.
    es = ESExporter(db)
    es.index_ds = throw_exception_function

    try:
        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        ds_config_str = open(ds_config_path).read()
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        ds = Dataset.load(db, ds_id)
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # dataset and job tables asserts
        row = db.select_one('SELECT status from job')
        assert row[0] == 'FINISHED'
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
    finally:
        db.close()
        if annotate_daemon:
            annotate_daemon.stop()
        if update_daemon:
            update_daemon.stop()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
def test_index_ds_works(sm_config, test_db, es, es_dsl_search, sm_index, ds_config, metadata,
                        annotation_stats):
    """End-to-end check of ESExporter.index_ds against a populated test database:
    the dataset document and both annotation documents must match exactly.
    """
    ds_id = '2000-01-01_00h00m'
    upload_dt = datetime.now().isoformat()
    last_finished = '2017-01-01 00:00:00'
    iso_image_ids = ['iso_img_id_1', 'iso_img_id_2']
    stats = json.dumps(annotation_stats)

    # --- populate the relational fixtures the exporter reads from ---
    db = DB()
    db.insert(
        "INSERT INTO dataset(id, name, input_path, config, metadata, upload_dt, status, "
        "status_update_dt, is_public, acq_geometry, ion_thumbnail) "
        "VALUES (%s, 'ds_name', 'ds_input_path', %s, %s, %s, 'ds_status', %s, true, '{}', %s)",
        [[ds_id, json.dumps(ds_config), json.dumps(metadata), upload_dt, upload_dt, 'thumb-id']],
    )
    moldb = create_test_molecular_db()
    (job_id,) = db.insert_return(
        "INSERT INTO job(ds_id, moldb_id, status, start, finish) "
        "VALUES (%s, %s, 'job_status', %s, %s) RETURNING id",
        rows=[(ds_id, moldb.id, last_finished, last_finished)],
    )
    (user_id,) = db.insert_return(
        "INSERT INTO graphql.user (email, name, role) "
        "VALUES ('email', 'user_name', 'user') RETURNING id",
        [[]],
    )
    (group_id,) = db.insert_return(
        "INSERT INTO graphql.group (name, short_name) VALUES ('group name', 'grp') RETURNING id",
        [[]],
    )
    db.insert(
        "INSERT INTO graphql.dataset(id, user_id, group_id) VALUES (%s, %s, %s)",
        [[ds_id, user_id, group_id]],
    )
    ion_id1, ion_id2 = db.insert_return(
        "INSERT INTO graphql.ion(ion, formula, chem_mod, neutral_loss, adduct, charge, ion_formula) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s) RETURNING id",
        [
            ['H2O-H+O-H+H', 'H2O', '-H+O', '-H', '+H', 1, 'HO2'],
            ['Au+H', 'Au', '', '', '+H', 1, 'HAu'],
        ],
    )
    db.insert(
        "INSERT INTO annotation(job_id, formula, chem_mod, neutral_loss, adduct, "
        "msm, fdr, stats, iso_image_ids, ion_id) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
        [
            [job_id, 'H2O', '-H+O', '-H', '+H', 1, 0.1, stats, iso_image_ids, ion_id1],
            [job_id, 'Au', '', '', '+H', 1, 0.05, stats, iso_image_ids, ion_id2],
        ],
    )

    # --- stub out isotope pattern generation ---
    isocalc_mock = MagicMock(IsocalcWrapper)
    isocalc_mock.centroids = lambda formula: {
        'H2O+H': ([100.0, 200.0], None),
        'H2O-H+O-H+H': ([100.0, 200.0, 300.0], None),
        'Au+H': ([10.0, 20.0], None),
    }[formula]
    isocalc_mock.mass_accuracy_bounds = lambda mzs: (mzs, mzs)

    # --- run the exporter with molecule lookup patched ---
    with patch(
        'sm.engine.es_export.molecular_db.fetch_molecules',
        return_value=pd.DataFrame(
            [('H2O', 'mol_id', 'mol_name'), ('Au', 'mol_id', 'mol_name')],
            columns=['formula', 'mol_id', 'mol_name'],
        ),
    ):
        es_exp = ESExporter(db, sm_config)
        es_exp.delete_ds(ds_id)
        es_exp.index_ds(ds_id=ds_id, moldb=moldb, isocalc=isocalc_mock)

    wait_for_es(es, sm_config['elasticsearch']['index'])

    # --- dataset document ---
    ds_d = es_dsl_search.filter(
        'term', _type='dataset').execute().to_dict()['hits']['hits'][0]['_source']
    expected_ds_fields = {
        'ds_last_finished': last_finished,
        'ds_config': ds_config,
        'ds_adducts': ds_config['isotope_generation']['adducts'],
        'ds_moldb_ids': ds_config['database_ids'],
        'ds_chem_mods': [],
        'ds_neutral_losses': [],
        'ds_project_ids': [],
        'ds_project_names': [],
        'ds_meta': metadata,
        'ds_status': 'ds_status',
        'ds_status_update_dt': upload_dt,
        'ds_name': 'ds_name',
        'ds_input_path': 'ds_input_path',
        'ds_id': ds_id,
        'ds_upload_dt': upload_dt,
        'ds_is_public': True,
        'ds_submitter_email': 'email',
        'ds_submitter_id': user_id,
        'ds_submitter_name': 'user_name',
        'ds_group_approved': False,
        'ds_group_id': group_id,
        'ds_group_name': 'group name',
        'ds_group_short_name': 'grp',
    }
    assert ds_d == {
        **expected_ds_fields,
        'ds_acq_geometry': {},
        'annotation_counts': [{
            'db': {'id': moldb.id, 'name': moldb.name},
            'counts': [
                {'level': 5, 'n': 1},
                {'level': 10, 'n': 2},
                {'level': 20, 'n': 2},
                {'level': 50, 'n': 2},
            ],
        }],
    }

    # --- annotation documents ---
    top_level_stats = {
        'pattern_match': annotation_stats['spectral'],
        'image_corr': annotation_stats['spatial'],
        'chaos': annotation_stats['chaos'],
        **{k: v for k, v in annotation_stats.items() if k in NON_METRIC_STATS},
    }
    metrics = {k: v for k, v in annotation_stats.items() if k not in NON_METRIC_STATS}

    ann_1_d = es_dsl_search.filter(
        'term', formula='H2O').execute().to_dict()['hits']['hits'][0]['_source']
    assert ann_1_d == {
        **expected_ds_fields,
        **top_level_stats,
        'metrics': metrics,
        'fdr': 0.1,
        'formula': 'H2O',
        'msm': 1.0,
        'ion': 'H2O-H+O-H+H+',
        'ion_formula': 'HO2',
        'centroid_mzs': [100.0, 200.0, 300.0],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
        'iso_image_urls': [
            f'http://localhost:9000/{sm_config["image_storage"]["bucket"]}/iso/{ds_id}/iso_img_id_1',
            f'http://localhost:9000/{sm_config["image_storage"]["bucket"]}/iso/{ds_id}/iso_img_id_2',
        ],
        'isobars': [],
        'isomer_ions': [],
        'polarity': '+',
        'job_id': 1,
        'adduct': '+H',
        'neutral_loss': '-H',
        'chem_mod': '-H+O',
        'annotation_counts': [],
        'comp_names': ['mol_name'],
        'comps_count_with_isomers': 1,
        'db_id': moldb.id,
        'db_name': moldb.name,
        'db_version': moldb.version,
        'mz': 100.0,
        'comp_ids': ['mol_id'],
        'annotation_id': 1,
        'off_sample_label': None,
        'off_sample_prob': None,
    }

    ann_2_d = es_dsl_search.filter(
        'term', formula='Au').execute().to_dict()['hits']['hits'][0]['_source']
    assert ann_2_d == {
        **expected_ds_fields,
        **top_level_stats,
        'metrics': metrics,
        'fdr': 0.05,
        'formula': 'Au',
        'msm': 1.0,
        'ion': 'Au+H+',
        'ion_formula': 'HAu',
        'centroid_mzs': [10.0, 20.0],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
        'iso_image_urls': [
            f'http://localhost:9000/{sm_config["image_storage"]["bucket"]}/iso/{ds_id}/iso_img_id_1',
            f'http://localhost:9000/{sm_config["image_storage"]["bucket"]}/iso/{ds_id}/iso_img_id_2',
        ],
        'isobars': [],
        'isomer_ions': [],
        'polarity': '+',
        'job_id': 1,
        'adduct': '+H',
        'neutral_loss': '',
        'chem_mod': '',
        'annotation_counts': [],
        'comp_names': ['mol_name'],
        'comps_count_with_isomers': 1,
        'db_id': moldb.id,
        'db_name': moldb.name,
        'db_version': moldb.version,
        'mz': 10.0,
        'comp_ids': ['mol_id'],
        'annotation_id': 2,
        'off_sample_label': None,
        'off_sample_prob': None,
    }
def test_sm_daemon_es_export_fails(
    MSMSearchMock,
    post_images_to_image_store_mock,
    test_db,
    es_dsl_search,
    clean_isotope_storage,
    reset_queues,
    metadata,
    ds_config,
    queue_pub,
    local_sm_config,
):
    """An exception during ES export must leave the job FINISHED but the dataset FAILED."""
    moldb = init_moldb()

    # Fake search results: three ions of the same formula with identical metrics.
    formula_metrics_df = pd.DataFrame({
        'formula_i': [0, 1, 2],
        'ion_formula': ['C12H24O-H2O+H', 'C12H24O-H2+O2-CO+Na', 'C12H24O+K'],
        'formula': ['C12H24O', 'C12H24O', 'C12H24O'],
        'modifier': ['-H2O+H', '-H2+O2-CO+Na', '+K'],
        'chem_mod': ['', '-H2+O2', ''],
        'neutral_loss': ['-H2O', '-CO', ''],
        'adduct': ['+H', '+Na', '+K'],
        'chaos': [0.9, 0.9, 0.9],
        'spatial': [0.9, 0.9, 0.9],
        'spectral': [0.9, 0.9, 0.9],
        'msm': [0.9 ** 3, 0.9 ** 3, 0.9 ** 3],
        'total_iso_ints': [[100.0], [100.0], [100.0]],
        'min_iso_ints': [[0], [0], [0]],
        'max_iso_ints': [[10.0], [10.0], [10.0]],
        'fdr': [0.1, 0.1, 0.1],
    }).set_index('formula_i')

    search_algo_mock = MSMSearchMock()
    search_algo_mock.search.return_value = [
        (formula_metrics_df, [], create_test_fdr_diagnostics_bundle())
    ]
    search_algo_mock.metrics = OrderedDict([
        ('chaos', 0),
        ('spatial', 0),
        ('spectral', 0),
        ('msm', 0),
        ('total_iso_ints', []),
        ('min_iso_ints', []),
        ('max_iso_ints', []),
    ])
    image_ids = ['iso_image_1', None, None, None]
    post_images_to_image_store_mock.return_value = {0: image_ids, 1: image_ids, 2: image_ids}

    db = DB()

    def throw_exception_function(*args, **kwargs):
        raise Exception('Test')

    # Sabotage the export step so the daemon's failure path is exercised.
    es = ESExporter(db, local_sm_config)
    es.index_ds = throw_exception_function

    ds = create_test_ds(
        name=test_ds_name,
        input_path=input_dir_path,
        config={**ds_config, 'database_ids': [moldb.id]},
        status=DatasetStatus.QUEUED,
        es=es,
    )
    queue_pub.publish({'ds_id': ds.id, 'ds_name': test_ds_name, 'action': DaemonAction.ANNOTATE})

    run_daemons(db, es, local_sm_config)

    # dataset and job tables asserts
    row = db.select_one('SELECT status from job')
    assert row[0] == 'FINISHED'
    row = db.select_one('SELECT status from dataset')
    assert row[0] == 'FAILED'
def run(self, input_path, ds_config_path, clean=False):
    """ Entry point of the engine. Molecule search is completed in several steps:
     * Copying input data to the engine work dir
     * Conversion input data (imzML+ibd) to plain text format. One line - one spectrum data
     * Generation and saving to the database theoretical peaks for all formulas from the molecule database
     * Molecules search. The most compute intensive part. Spark is used to run it in distributed manner.
     * Saving results (isotope images and their metrics of quality for each putative molecule) to the database

    Args
    -------
    input_path : string
        Path to the dataset folder with .imzML and .ibd files
    ds_config_path: string
        Path to the dataset config file
    clean : bool
        Clean all interim data files before starting molecule search
    """
    try:
        self.wd_manager = WorkDirManager(self.ds_name)
        if clean:
            self.wd_manager.clean()

        self.wd_manager.copy_input_data(input_path, ds_config_path)

        self._read_ds_config()
        logger.info('Dataset config:\n%s', pformat(self.ds_config))

        self._configure_spark()
        self._init_db()

        # Convert imzML to plain text only if not already done on a previous run.
        if not self.wd_manager.exists(self.wd_manager.txt_path):
            imzml_converter = ImzmlTxtConverter(self.ds_name,
                                                self.wd_manager.local_dir.imzml_path,
                                                self.wd_manager.local_dir.txt_path,
                                                self.wd_manager.local_dir.coord_path)
            imzml_converter.convert()

            # NOTE(review): upload kept inside the "freshly converted" branch —
            # the collapsed original made this nesting ambiguous; confirm.
            if not self.wd_manager.local_fs_only:
                self.wd_manager.upload_to_remote()

        self.ds = Dataset(self.sc, self.ds_name, self.client_email, input_path,
                          self.ds_config, self.wd_manager, self.db)
        self.ds.save_ds_meta()

        self.store_job_meta()

        theor_peaks_gen = TheorPeaksGenerator(self.sc, self.sm_config, self.ds_config)
        theor_peaks_gen.run()

        target_adducts = self.ds_config['isotope_generation']['adducts']
        self.fdr = FDR(self.job_id, self.sf_db_id, decoy_sample_size=20,
                       target_adducts=target_adducts, db=self.db)
        self.fdr.decoy_adduct_selection()
        self.formulas = FormulasSegm(self.job_id, self.sf_db_id, self.ds_config, self.db)

        search_alg = MSMExtraFeats(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
        sf_metrics_df, sf_iso_images = search_alg.search()

        search_results = SearchResults(self.sf_db_id, self.ds_id, self.job_id,
                                       self.ds_name, self.formulas.get_sf_adduct_peaksn(),
                                       self.db, self.sm_config, self.ds_config)
        search_results.sf_metrics_df = sf_metrics_df
        search_results.sf_iso_images = sf_iso_images
        search_results.metrics = search_alg.metrics
        search_results.nrows, search_results.ncols = self.ds.get_dims()
        search_results.store()

        es = ESExporter(self.sm_config)
        es.index_ds(self.db, self.ds_name, self.ds_config['database']['name'])
    except Exception:
        # FIX: logger.exception replaces the manual sys.exc_info()/
        # traceback.format_exception reimplementation; it logs the same traceback.
        logger.exception('Molecule search job failed')
    finally:
        # Always release cluster and DB resources, even after a failure.
        if self.sc:
            self.sc.stop()
        if self.db:
            self.db.close()