Example #1
def reindex_all_results(conf):
    db = DB(conf['db'])
    es_exp = ESExporter(conf)

    es_exp.delete_index(name='sm')
    es_exp.create_index(name='sm')

    ds_db_pairs = db.select("select name, config -> 'database'::text -> 'name'::text from dataset")

    for ds_name, db_name in ds_db_pairs:
        es_exp.index_ds(db, ds_name, db_name)
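
A minimal usage sketch for the function above. Only the top-level config keys are taken from these examples (Example #1 reads conf['db'], Example #8 reads sm_config['elasticsearch']); the nested values below are assumptions and depend on the deployment.

# Hypothetical config values for illustration; the inner keys are assumptions.
conf = {
    'db': {'host': 'localhost', 'database': 'sm', 'user': 'sm', 'password': 'password'},
    'elasticsearch': {'host': 'localhost', 'port': 9200, 'index': 'sm'},
}
reindex_all_results(conf)  # drops the 'sm' index, recreates it, and reindexes every dataset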
Example #2
def reindex_all_results(conf):
    db = DB(conf['db'])
    es_exp = ESExporter(conf)

    es_exp.delete_index(name='sm')
    es_exp.create_index(name='sm')

    ds_db_pairs = db.select(
        "select name, config -> 'database'::text -> 'name'::text from dataset")

    for ds_name, db_name in ds_db_pairs:
        es_exp.index_ds(db, ds_name, db_name)
Example #3
def test_foo(sm_config):
    annotations = [('test_ds', 'test_db', 'H20', '+H', [], []), ('test_ds', 'test_db', 'Au', '+H', [], [])]
    db_mock = MagicMock(DB)
    db_mock.select.return_value = annotations

    es_exp = ESExporter(sm_config)
    es_exp.index_ds(db_mock, 'test_ds', 'test_db')

    es = Elasticsearch()

    d = es.get(index='sm', id='test_ds_test_db_H20_+H', doc_type='annotation', _source=True)
    assert d['_source'] == {'ds_name': 'test_ds', 'db_name': 'test_db', 'sf': 'H20', 'adduct': '+H', 'comp_names': '', 'comp_ids': ''}

    d = es.get(index='sm', id='test_ds_test_db_Au_+H', doc_type='annotation', _source=True)
    assert d['_source'] == {'ds_name': 'test_ds', 'db_name': 'test_db', 'sf': 'Au', 'adduct': '+H', 'comp_names': '', 'comp_ids': ''}
Example #4
def test_foo(sm_config):
    annotations = [('test_ds', 'test_db', 'H20', '+H', [], [], 100), ('test_ds', 'test_db', 'Au', '+H', [], [], 200)]
    db_mock = MagicMock(DB)
    db_mock.select.return_value = annotations

    es_exp = ESExporter(sm_config)
    es_exp.index_ds(db_mock, 'test_ds', 'test_db')

    es = Elasticsearch()

    d = es.get(index='sm', id='test_ds_test_db_H20_+H', doc_type='annotation', _source=True)
    assert d['_source'] == {'ds_name': 'test_ds', 'db_name': 'test_db', 'sf': 'H20', 'adduct': '+H',
                            'comp_names': '', 'comp_ids': '', 'mz': '00100.0000'}

    d = es.get(index='sm', id='test_ds_test_db_Au_+H', doc_type='annotation', _source=True)
    assert d['_source'] == {'ds_name': 'test_ds', 'db_name': 'test_db', 'sf': 'Au', 'adduct': '+H',
                            'comp_names': '', 'comp_ids': '', 'mz': '00200.0000'}
Example #5
def test_index_ds_works(es_dsl_search, sm_index, sm_config):
    ds_id = '2000-01-01_00h00m'
    upload_dt = datetime.now().isoformat(' ')
    mol_db_id = 0
    last_finished = '2017-01-01T00:00:00'

    def db_sel_side_effect(sql, params):
        if sql == DATASET_SEL:
            return [{
                'ds_id': ds_id,
                'ds_name': 'ds_name',
                'ds_input_path': 'ds_input_path',
                'ds_config': 'ds_config',
                'ds_meta': {},
                'ds_upload_dt': upload_dt,
                'ds_status': 'ds_status',
                'ds_last_finished': datetime.strptime(last_finished, '%Y-%m-%dT%H:%M:%S'),
                'ds_is_public': True,
                'ds_ion_img_storage': 'fs',
                'ds_acq_geometry': {}
            }]
        elif sql == ANNOTATIONS_SEL:
            return [{
                'sf': 'H2O',
                'sf_adduct': 'H2O+H',
                'chaos': 1,
                'image_corr': 1,
                'pattern_match': 1,
                'total_iso_ints': 100,
                'min_iso_ints': 0,
                'max_iso_ints': 100,
                'msm': 1,
                'adduct': '+H',
                'job_id': 1,
                'fdr': 0.1,
                'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
                'polarity': '+'
            }, {
                'sf': 'Au',
                'sf_adduct': 'Au+H',
                'chaos': 1,
                'image_corr': 1,
                'pattern_match': 1,
                'total_iso_ints': 100,
                'min_iso_ints': 0,
                'max_iso_ints': 100,
                'msm': 1,
                'adduct': '+H',
                'job_id': 1,
                'fdr': 0.05,
                'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
                'polarity': '+'
            }]
        else:
            logging.getLogger('engine').error('Unexpected db_sel_side_effect arguments: %s', (sql, params))

    db_mock = MagicMock(spec=DB)
    db_mock.select_with_fields.side_effect = db_sel_side_effect

    mol_db_mock = MagicMock(MolecularDB)
    mol_db_mock.id = mol_db_id
    mol_db_mock.name = 'db_name'
    mol_db_mock.version = '2017'
    mol_db_mock.get_molecules.return_value = pd.DataFrame([('H2O', 'mol_id', 'mol_name'), ('Au', 'mol_id', 'mol_name')],
                                                          columns=['sf', 'mol_id', 'mol_name'])

    isocalc_mock = MagicMock(IsocalcWrapper)
    isocalc_mock.ion_centroids = lambda sf, adduct: {
        ('H2O', '+H'): ([100., 200.], None),
        ('Au', '+H'): ([10., 20.], None)
    }[(sf, adduct)]

    es_exp = ESExporter(db_mock)
    es_exp.delete_ds(ds_id)
    es_exp.index_ds(ds_id=ds_id, mol_db=mol_db_mock, isocalc=isocalc_mock)

    wait_for_es(sec=1)

    ds_d = es_dsl_search.filter('term', _type='dataset').execute().to_dict()['hits']['hits'][0]['_source']
    assert ds_d == {
        'ds_last_finished': last_finished, 'ds_config': 'ds_config', 'ds_meta': {},
        'ds_status': 'ds_status', 'ds_name': 'ds_name', 'ds_input_path': 'ds_input_path', 'ds_id': ds_id,
        'ds_upload_dt': upload_dt,
        'annotation_counts': [{'db': {'name': 'db_name', 'version': '2017'},
                               'counts': [{'level': 5, 'n': 1}, {'level': 10, 'n': 2},
                                          {'level': 20, 'n': 2}, {'level': 50, 'n': 2}]}],
        'ds_is_public': True,
        'ds_acq_geometry': {},
        'ds_ion_img_storage': 'fs'
    }
    ann_1_d = es_dsl_search.filter('term', sf='H2O').execute().to_dict()['hits']['hits'][0]['_source']
    assert ann_1_d == {
        'pattern_match': 1, 'image_corr': 1, 'fdr': 0.1, 'chaos': 1, 'sf': 'H2O', 'min_iso_ints': 0,
        'msm': 1, 'sf_adduct': 'H2O+H', 'total_iso_ints': 100, 'centroid_mzs': [100., 200.],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'], 'polarity': '+', 'job_id': 1, 'max_iso_ints': 100,
        'adduct': '+H', 'ds_name': 'ds_name', 'annotation_counts': [], 'db_version': '2017', 'ds_status': 'ds_status',
        'ion_add_pol': '[M+H]+', 'comp_names': ['mol_name'], 'db_name': 'db_name', 'mz': 100., 'ds_meta': {},
        'comp_ids': ['mol_id'], 'ds_config': 'ds_config', 'ds_input_path': 'ds_input_path', 'ds_id': ds_id,
        'ds_upload_dt': upload_dt, 'ds_last_finished': last_finished,
        'ds_ion_img_storage': 'fs', 'ds_is_public': True
    }
    ann_2_d = es_dsl_search.filter('term', sf='Au').execute().to_dict()['hits']['hits'][0]['_source']
    assert ann_2_d == {
        'pattern_match': 1, 'image_corr': 1, 'fdr': 0.05, 'chaos': 1, 'sf': 'Au', 'min_iso_ints': 0,
        'msm': 1, 'sf_adduct': 'Au+H', 'total_iso_ints': 100, 'centroid_mzs': [10., 20.],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'], 'polarity': '+', 'job_id': 1, 'max_iso_ints': 100,
        'adduct': '+H',  'ds_name': 'ds_name', 'annotation_counts': [], 'db_version': '2017', 'ds_status': 'ds_status',
        'ion_add_pol': '[M+H]+', 'comp_names': ['mol_name'], 'db_name': 'db_name', 'mz': 10., 'ds_meta': {},
        'comp_ids': ['mol_id'], 'ds_config': 'ds_config', 'ds_input_path': 'ds_input_path', 'ds_id': ds_id,
        'ds_upload_dt': upload_dt, 'ds_last_finished': last_finished,
        'ds_ion_img_storage': 'fs', 'ds_is_public': True
    }
Example #6
def test_sm_daemon_es_export_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                   post_images_to_annot_service_mock,
                                   MolDBServiceWrapperMock,
                                   sm_config, test_db, es_dsl_search,
                                   clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])
    annotate_daemon = None
    update_daemon = None

    def throw_exception_function(*args, **kwargs):
        raise Exception('Test')

    es = ESExporter(db)
    es.index_ds = throw_exception_function

    try:
        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        ds_config_str = open(ds_config_path).read()
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        ds = Dataset.load(db, ds_id)
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # dataset and job tables asserts
        row = db.select_one('SELECT status from job')
        assert row[0] == 'FINISHED'
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
    finally:
        db.close()
        if annotate_daemon:
            annotate_daemon.stop()
        if update_daemon:
            update_daemon.stop()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
Example #8
def test_index_ds_works(sm_config, test_db, es, es_dsl_search, sm_index,
                        ds_config, metadata, annotation_stats):
    ds_id = '2000-01-01_00h00m'
    upload_dt = datetime.now().isoformat()
    last_finished = '2017-01-01 00:00:00'
    iso_image_ids = ['iso_img_id_1', 'iso_img_id_2']
    stats = json.dumps(annotation_stats)

    db = DB()
    db.insert(
        "INSERT INTO dataset(id, name, input_path, config, metadata, upload_dt, status, "
        "status_update_dt, is_public, acq_geometry, ion_thumbnail) "
        "VALUES (%s, 'ds_name', 'ds_input_path', %s, %s, %s, 'ds_status', %s, true, '{}', %s)",
        [[
            ds_id,
            json.dumps(ds_config),
            json.dumps(metadata), upload_dt, upload_dt, 'thumb-id'
        ]],
    )
    moldb = create_test_molecular_db()
    (job_id, ) = db.insert_return(
        "INSERT INTO job(ds_id, moldb_id, status, start, finish) "
        "VALUES (%s, %s, 'job_status', %s, %s) RETURNING id",
        rows=[(ds_id, moldb.id, last_finished, last_finished)],
    )
    (user_id, ) = db.insert_return(
        "INSERT INTO graphql.user (email, name, role) "
        "VALUES ('email', 'user_name', 'user') RETURNING id",
        [[]],
    )
    (group_id, ) = db.insert_return(
        "INSERT INTO graphql.group (name, short_name) VALUES ('group name', 'grp') RETURNING id",
        [[]],
    )
    db.insert(
        "INSERT INTO graphql.dataset(id, user_id, group_id) VALUES (%s, %s, %s)",
        [[ds_id, user_id, group_id]],
    )
    ion_id1, ion_id2 = db.insert_return(
        "INSERT INTO graphql.ion(ion, formula, chem_mod, neutral_loss, adduct, charge, ion_formula) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s) RETURNING id",
        [
            ['H2O-H+O-H+H', 'H2O', '-H+O', '-H', '+H', 1, 'HO2'],
            ['Au+H', 'Au', '', '', '+H', 1, 'HAu'],
        ],
    )
    db.insert(
        "INSERT INTO annotation(job_id, formula, chem_mod, neutral_loss, adduct, "
        "msm, fdr, stats, iso_image_ids, ion_id) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
        [
            [
                job_id, 'H2O', '-H+O', '-H', '+H', 1, 0.1, stats,
                iso_image_ids, ion_id1
            ],
            [
                job_id, 'Au', '', '', '+H', 1, 0.05, stats, iso_image_ids,
                ion_id2
            ],
        ],
    )

    isocalc_mock = MagicMock(IsocalcWrapper)
    isocalc_mock.centroids = lambda formula: {
        'H2O+H': ([100.0, 200.0], None),
        'H2O-H+O-H+H': ([100.0, 200.0, 300.0], None),
        'Au+H': ([10.0, 20.0], None),
    }[formula]
    isocalc_mock.mass_accuracy_bounds = lambda mzs: (mzs, mzs)

    with patch(
            'sm.engine.es_export.molecular_db.fetch_molecules',
            return_value=pd.DataFrame(
                [('H2O', 'mol_id', 'mol_name'), ('Au', 'mol_id', 'mol_name')],
                columns=['formula', 'mol_id', 'mol_name'],
            ),
    ):
        es_exp = ESExporter(db, sm_config)
        es_exp.delete_ds(ds_id)
        es_exp.index_ds(
            ds_id=ds_id,
            moldb=moldb,
            isocalc=isocalc_mock,
        )

    wait_for_es(es, sm_config['elasticsearch']['index'])

    ds_d = es_dsl_search.filter('term', _type='dataset').execute().to_dict()['hits']['hits'][0]['_source']
    expected_ds_fields = {
        'ds_last_finished': last_finished,
        'ds_config': ds_config,
        'ds_adducts': ds_config['isotope_generation']['adducts'],
        'ds_moldb_ids': ds_config['database_ids'],
        'ds_chem_mods': [],
        'ds_neutral_losses': [],
        'ds_project_ids': [],
        'ds_project_names': [],
        'ds_meta': metadata,
        'ds_status': 'ds_status',
        'ds_status_update_dt': upload_dt,
        'ds_name': 'ds_name',
        'ds_input_path': 'ds_input_path',
        'ds_id': ds_id,
        'ds_upload_dt': upload_dt,
        'ds_is_public': True,
        'ds_submitter_email': 'email',
        'ds_submitter_id': user_id,
        'ds_submitter_name': 'user_name',
        'ds_group_approved': False,
        'ds_group_id': group_id,
        'ds_group_name': 'group name',
        'ds_group_short_name': 'grp',
    }
    assert ds_d == {
        **expected_ds_fields,
        'ds_acq_geometry': {},
        'annotation_counts': [{
            'db': {'id': moldb.id, 'name': moldb.name},
            'counts': [
                {'level': 5, 'n': 1},
                {'level': 10, 'n': 2},
                {'level': 20, 'n': 2},
                {'level': 50, 'n': 2},
            ],
        }],
    }
    ann_1_d = es_dsl_search.filter('term', formula='H2O').execute().to_dict()['hits']['hits'][0]['_source']
    top_level_stats = {
        'pattern_match': annotation_stats['spectral'],
        'image_corr': annotation_stats['spatial'],
        'chaos': annotation_stats['chaos'],
        **{
            key: value
            for key, value in annotation_stats.items() if key in NON_METRIC_STATS
        },
    }
    metrics = {
        key: value
        for key, value in annotation_stats.items()
        if key not in NON_METRIC_STATS
    }
    assert ann_1_d == {
        **expected_ds_fields,
        **top_level_stats,
        'metrics': metrics,
        'fdr': 0.1,
        'formula': 'H2O',
        'msm': 1.0,
        'ion': 'H2O-H+O-H+H+',
        'ion_formula': 'HO2',
        'centroid_mzs': [100.0, 200.0, 300.0],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
        'iso_image_urls': [
            f'http://localhost:9000/{sm_config["image_storage"]["bucket"]}/iso/{ds_id}/iso_img_id_1',
            f'http://localhost:9000/{sm_config["image_storage"]["bucket"]}/iso/{ds_id}/iso_img_id_2',
        ],
        'isobars': [],
        'isomer_ions': [],
        'polarity': '+',
        'job_id': 1,
        'adduct': '+H',
        'neutral_loss': '-H',
        'chem_mod': '-H+O',
        'annotation_counts': [],
        'comp_names': ['mol_name'],
        'comps_count_with_isomers': 1,
        'db_id': moldb.id,
        'db_name': moldb.name,
        'db_version': moldb.version,
        'mz': 100.0,
        'comp_ids': ['mol_id'],
        'annotation_id': 1,
        'off_sample_label': None,
        'off_sample_prob': None,
    }
    ann_2_d = es_dsl_search.filter('term', formula='Au').execute().to_dict()['hits']['hits'][0]['_source']
    assert ann_2_d == {
        **expected_ds_fields,
        **top_level_stats,
        'metrics': metrics,
        'fdr': 0.05,
        'formula': 'Au',
        'msm': 1.0,
        'ion': 'Au+H+',
        'ion_formula': 'HAu',
        'centroid_mzs': [10.0, 20.0],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
        'iso_image_urls': [
            f'http://localhost:9000/{sm_config["image_storage"]["bucket"]}/iso/{ds_id}/iso_img_id_1',
            f'http://localhost:9000/{sm_config["image_storage"]["bucket"]}/iso/{ds_id}/iso_img_id_2',
        ],
        'isobars': [],
        'isomer_ions': [],
        'polarity': '+',
        'job_id': 1,
        'adduct': '+H',
        'neutral_loss': '',
        'chem_mod': '',
        'annotation_counts': [],
        'comp_names': ['mol_name'],
        'comps_count_with_isomers': 1,
        'db_id': moldb.id,
        'db_name': moldb.name,
        'db_version': moldb.version,
        'mz': 10.0,
        'comp_ids': ['mol_id'],
        'annotation_id': 2,
        'off_sample_label': None,
        'off_sample_prob': None,
    }
Example #9
def test_sm_daemon_es_export_fails(
    MSMSearchMock,
    post_images_to_image_store_mock,
    test_db,
    es_dsl_search,
    clean_isotope_storage,
    reset_queues,
    metadata,
    ds_config,
    queue_pub,
    local_sm_config,
):
    moldb = init_moldb()

    formula_metrics_df = pd.DataFrame({
        'formula_i': [0, 1, 2],
        'ion_formula': ['C12H24O-H2O+H', 'C12H24O-H2+O2-CO+Na', 'C12H24O+K'],
        'formula': ['C12H24O', 'C12H24O', 'C12H24O'],
        'modifier': ['-H2O+H', '-H2+O2-CO+Na', '+K'],
        'chem_mod': ['', '-H2+O2', ''],
        'neutral_loss': ['-H2O', '-CO', ''],
        'adduct': ['+H', '+Na', '+K'],
        'chaos': [0.9, 0.9, 0.9],
        'spatial': [0.9, 0.9, 0.9],
        'spectral': [0.9, 0.9, 0.9],
        'msm': [0.9**3, 0.9**3, 0.9**3],
        'total_iso_ints': [[100.0], [100.0], [100.0]],
        'min_iso_ints': [[0], [0], [0]],
        'max_iso_ints': [[10.0], [10.0], [10.0]],
        'fdr': [0.1, 0.1, 0.1],
    }).set_index('formula_i')
    search_algo_mock = MSMSearchMock()
    search_algo_mock.search.return_value = [
        (formula_metrics_df, [], create_test_fdr_diagnostics_bundle())
    ]
    search_algo_mock.metrics = OrderedDict([
        ('chaos', 0),
        ('spatial', 0),
        ('spectral', 0),
        ('msm', 0),
        ('total_iso_ints', []),
        ('min_iso_ints', []),
        ('max_iso_ints', []),
    ])
    image_ids = ['iso_image_1', None, None, None]
    post_images_to_image_store_mock.return_value = {
        0: image_ids,
        1: image_ids,
        2: image_ids
    }

    db = DB()

    def throw_exception_function(*args, **kwargs):
        raise Exception('Test')

    es = ESExporter(db, local_sm_config)
    es.index_ds = throw_exception_function

    ds = create_test_ds(
        name=test_ds_name,
        input_path=input_dir_path,
        config={
            **ds_config, 'database_ids': [moldb.id]
        },
        status=DatasetStatus.QUEUED,
        es=es,
    )

    queue_pub.publish({
        'ds_id': ds.id,
        'ds_name': test_ds_name,
        'action': DaemonAction.ANNOTATE
    })

    run_daemons(db, es, local_sm_config)

    # dataset and job tables asserts
    row = db.select_one('SELECT status from job')
    assert row[0] == 'FINISHED'
    row = db.select_one('SELECT status from dataset')
    assert row[0] == 'FAILED'
Example #10
    def run(self, input_path, ds_config_path, clean=False):
        """ Entry point of the engine. Molecule search is completed in several steps:
         * Copying input data to the engine work dir
         * Conversion input data (imzML+ibd) to plain text format. One line - one spectrum data
         * Generation and saving to the database theoretical peaks for all formulas from the molecule database
         * Molecules search. The most compute intensive part. Spark is used to run it in distributed manner.
         * Saving results (isotope images and their metrics of quality for each putative molecule) to the database

        Args
        -------
        input_path : string
            Path to the dataset folder with .imzML and .ibd files
        ds_config_path: string
            Path to the dataset config file
        clean : bool
            Clean all interim data files before starting molecule search
        """
        try:
            self.wd_manager = WorkDirManager(self.ds_name)
            if clean:
                self.wd_manager.clean()

            self.wd_manager.copy_input_data(input_path, ds_config_path)

            self._read_ds_config()
            logger.info('Dataset config:\n%s', pformat(self.ds_config))

            self._configure_spark()
            self._init_db()

            if not self.wd_manager.exists(self.wd_manager.txt_path):
                imzml_converter = ImzmlTxtConverter(
                    self.ds_name, self.wd_manager.local_dir.imzml_path,
                    self.wd_manager.local_dir.txt_path,
                    self.wd_manager.local_dir.coord_path)
                imzml_converter.convert()

                if not self.wd_manager.local_fs_only:
                    self.wd_manager.upload_to_remote()

            self.ds = Dataset(self.sc, self.ds_name, self.client_email,
                              input_path, self.ds_config, self.wd_manager,
                              self.db)
            self.ds.save_ds_meta()

            self.store_job_meta()

            theor_peaks_gen = TheorPeaksGenerator(self.sc, self.sm_config,
                                                  self.ds_config)
            theor_peaks_gen.run()

            target_adducts = self.ds_config['isotope_generation']['adducts']
            self.fdr = FDR(self.job_id,
                           self.sf_db_id,
                           decoy_sample_size=20,
                           target_adducts=target_adducts,
                           db=self.db)
            self.fdr.decoy_adduct_selection()
            self.formulas = FormulasSegm(self.job_id, self.sf_db_id,
                                         self.ds_config, self.db)

            # search_alg = MSMBasicSearch(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
            search_alg = MSMExtraFeats(self.sc, self.ds, self.formulas,
                                       self.fdr, self.ds_config)
            sf_metrics_df, sf_iso_images = search_alg.search()

            search_results = SearchResults(
                self.sf_db_id, self.ds_id, self.job_id, self.ds_name,
                self.formulas.get_sf_adduct_peaksn(), self.db, self.sm_config,
                self.ds_config)
            search_results.sf_metrics_df = sf_metrics_df
            search_results.sf_iso_images = sf_iso_images
            search_results.metrics = search_alg.metrics
            search_results.nrows, search_results.ncols = self.ds.get_dims()
            search_results.store()

            es = ESExporter(self.sm_config)
            es.index_ds(self.db, self.ds_name,
                        self.ds_config['database']['name'])

        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            logger.error('\n'.join(
                traceback.format_exception(exc_type, exc_value,
                                           exc_traceback)))
        finally:
            if self.sc:
                # self.sc.show_profiles()
                self.sc.stop()
            if self.db:
                self.db.close()
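
For orientation, a hedged invocation sketch for the run() entry point above. The enclosing class is not shown in this snippet, so SearchJob below is purely a placeholder name, and its constructor arguments are assumptions based only on the attributes the method references (self.ds_name, self.client_email).

# Hypothetical usage; 'SearchJob' stands in for whatever class defines run() above.
job = SearchJob(ds_name='my_dataset', client_email='user@example.com')  # assumed constructor
job.run(
    input_path='/data/my_dataset',                  # folder with the .imzML and .ibd files
    ds_config_path='/data/my_dataset/config.json',  # dataset config file
    clean=True,                                     # wipe interim work-dir files first
)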