Example #1
    def test_add_optical_image(self, fill_db, sm_config, ds_config):
        db = DB(sm_config['db'])
        action_queue_mock = MagicMock(spec=QueuePublisher)
        es_mock = MagicMock(spec=ESExporter)
        img_store_mock = MagicMock(ImageStoreServiceWrapper)
        img_store_mock.post_image.side_effect = [
            'opt_img_id1', 'opt_img_id2', 'opt_img_id3', 'thumbnail_id'
        ]
        img_store_mock.get_image_by_id.return_value = Image.new(
            'RGB', (100, 100))

        ds_man = create_api_ds_man(sm_config=sm_config,
                                   db=db,
                                   es=es_mock,
                                   img_store=img_store_mock,
                                   annot_queue=action_queue_mock)
        ds_man._annotation_image_shape = MagicMock(return_value=(100, 100))

        ds_id = '2000-01-01'
        ds = create_ds(ds_id=ds_id, ds_config=ds_config)

        zoom_levels = [1, 2, 3]
        raw_img_id = 'raw_opt_img_id'
        ds_man.add_optical_image(ds,
                                 raw_img_id, [[1, 0, 0], [0, 1, 0], [0, 0, 1]],
                                 zoom_levels=zoom_levels)
        assert db.select('SELECT * FROM optical_image') == [
            ('opt_img_id{}'.format(i + 1), ds.id, zoom)
            for i, zoom in enumerate(zoom_levels)
        ]
        assert db.select('SELECT optical_image FROM dataset where id = %s',
                         params=(ds_id, )) == [(raw_img_id, )]
        assert db.select('SELECT thumbnail FROM dataset where id = %s',
                         params=(ds_id, )) == [('thumbnail_id', )]
Example #2
def get_ds_moldb_ids(ds_id: str, status: Optional[str] = None):
    if status is not None:
        return DB().select_onecol(
            'SELECT j.moldb_id FROM job j WHERE ds_id = %s AND status = %s',
            (ds_id, status))
    return DB().select_onecol('SELECT j.moldb_id FROM job j WHERE ds_id = %s',
                              (ds_id, ))
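A minimal usage sketch, assuming an initialized SM database connection and a dataset that already has rows in the job table (the dataset id and status value below are taken from the other examples):

all_moldb_ids = get_ds_moldb_ids('2000-01-01')
finished_moldb_ids = get_ds_moldb_ids('2000-01-01', status='FINISHED')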
Example #3
def run_off_sample(sm_config, ds_ids_str, sql_where, fix_missing, overwrite_existing):
    db = DB()

    ds_ids = None
    if ds_ids_str:
        ds_ids = ds_ids_str.split(',')
    elif sql_where:
        ds_ids = [
            id for (id,) in db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]
    elif fix_missing:
        logger.info('Checking for missing off-sample jobs...')
        results = db.select(MISSING_OFF_SAMPLE_SEL)
        ds_ids = [ds_id for ds_id, in results]
        logger.info(f'Found {len(ds_ids)} missing off-sample sets')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    es_exp = ESExporter(db, sm_config)
    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(f'Running off-sample on {i+1} out of {len(ds_ids)}')
            ds = Dataset.load(db, ds_id)
            classify_dataset_ion_images(db, ds, sm_config['services'], overwrite_existing)
            es_exp.reindex_ds(ds_id)
        except Exception:
            logger.error(f'Failed to run off-sample on {ds_id}', exc_info=True)
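A hypothetical command-line wrapper sketch for the maintenance function above; the argument names, the config path, and the ConnectionPool usage are assumptions modelled on the other examples, not the real script:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Run off-sample classification')
    parser.add_argument('--ds-ids', default=None, help='Comma-separated dataset ids')
    parser.add_argument('--sql-where', default=None, help='WHERE clause applied to the dataset table')
    parser.add_argument('--fix-missing', action='store_true')
    parser.add_argument('--overwrite-existing', action='store_true')
    args = parser.parse_args()

    SMConfig.set_path('conf/config.json')  # assumed config path
    sm_config = SMConfig.get_conf()
    with ConnectionPool(sm_config['db']):
        run_off_sample(sm_config, args.ds_ids, args.sql_where, args.fix_missing, args.overwrite_existing)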
Example #4
def test_classify_ion_images_preds_saved(call_api_mock, image_storage_mock, fill_db):
    call_api_mock.return_value = {
        'predictions': [{'prob': 0.1, 'label': 'on'}, {'prob': 0.9, 'label': 'off'}]
    }

    fp = io.BytesIO()
    Image.new('RGBA', (10, 10)).save(fp, format='PNG')
    fp.seek(0)
    img_bytes = fp.read()
    image_storage_mock.get_image.return_value = img_bytes

    db = DB()
    ds_id = '2000-01-01'
    ds = Dataset.load(db, ds_id)

    services_config = defaultdict(str)
    classify_dataset_ion_images(db, ds, services_config)

    annotations = db.select_with_fields(
        (
            'select off_sample '
            'from dataset d '
            'join job j on j.ds_id = d.id '
            'join annotation m on m.job_id = j.id '
            'where d.id = %s '
            'order by m.id '
        ),
        params=(ds_id,),
    )
    exp_annotations = [
        {'off_sample': {'prob': 0.1, 'label': 'on'}},
        {'off_sample': {'prob': 0.9, 'label': 'off'}},
    ]
    assert annotations == exp_annotations
Example #5
def test_create_moldb(fill_db, is_public):
    input_doc = moldb_input_doc(
        file_path=f's3://{BUCKET_NAME}/{MoldbFiles.VALID.value}',
        is_public=is_public)
    with patch_bottle_request(input_doc) as input_doc:

        resp = api.databases.create()

        assert resp['status'] == 'success'
        resp_doc = resp['data']

        db = DB()
        doc = db.select_one_with_fields(
            'SELECT id, name, version, group_id, is_public FROM molecular_db where id = %s',
            params=(resp_doc['id'], ),
        )
        for field in ['name', 'version', 'group_id', 'is_public']:
            assert doc[field] == input_doc[field]

        docs = db.select_with_fields(
            'SELECT * FROM molecule WHERE moldb_id = %s',
            params=(resp_doc['id'], ),
        )
        for doc in docs:
            print(doc)
            for field in ['mol_id', 'mol_name', 'formula', 'inchi']:
                assert field in doc
Example #6
def test_save_sf_iso_images_correct_db_call(spark_context,
                                            create_fill_sm_database, sm_config,
                                            ds_config):
    sf_iso_imgs = spark_context.parallelize([((1, '+H'), [
        csr_matrix([[100, 0, 0], [0, 0, 0]]),
        csr_matrix([[0, 0, 0], [0, 0, 10]])
    ])])
    sf_adduct_peaksn = [(1, '+H', 2)]
    res = SearchResults(0, 0, 0, 'ds_name', sf_adduct_peaksn, db_mock,
                        sm_config, ds_config)
    res.sf_iso_images = sf_iso_imgs
    res.nrows, res.ncols = 2, 3
    res.store_sf_iso_images()

    correct_rows = [(0, 0, 1, '+H', 0, [0], [100], 0, 100),
                    (0, 0, 1, '+H', 1, [5], [10], 0, 10)]

    db = DB(sm_config['db'])
    try:
        rows = db.select((
            'SELECT job_id, db_id, sf_id, adduct, peak, pixel_inds, intensities, min_int, max_int '
            'FROM iso_image '
            'ORDER BY sf_id, adduct'))
        assert correct_rows == rows
    finally:
        db.close()
Example #7
def update_optical_images(ds_id_str, sql_where):
    db = DB()

    if ds_id_str:
        ds_ids = ds_id_str.split(',')
    else:
        ds_ids = [
            id for (id, ) in db.select(
                f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]

    for i, ds_id in enumerate(ds_ids):
        try:
            transform, img_id = db.select_one(
                'SELECT transform, optical_image from dataset WHERE id = %s',
                params=(ds_id, ))
            if img_id and transform:
                logger.info(
                    f'[{i + 1}/{len(ds_ids)}] Updating optical image of dataset {ds_id}'
                )
                add_optical_image(db, ds_id, img_id, transform)
            else:
                logger.info(
                    f'[{i + 1}/{len(ds_ids)}] Skipping dataset {ds_id}')
        except Exception:
            logger.error(f'Failed to update optical image on {ds_id}',
                         exc_info=True)
Example #8
def reindex_results(sm_config, ds_id, ds_mask, use_inactive_index,
                    offline_reindex, update_fields):
    assert ds_id or ds_mask or offline_reindex

    IsocalcWrapper.set_centroids_cache_enabled(True)

    if offline_reindex:
        _reindex_all(sm_config)
    else:
        es_config = sm_config['elasticsearch']
        if use_inactive_index:
            es_config = get_inactive_index_es_config(es_config)

        db = DB()
        es_exp = ESExporter(db,
                            sm_config={
                                **sm_config, 'elasticsearch': es_config
                            })

        if ds_id:
            ds_ids = ds_id.split(',')
        elif ds_mask:
            ds_ids = [
                id for (id, ) in db.select(
                    "select id from dataset where name like '{}%'".format(
                        ds_mask))
            ]
        else:
            ds_ids = []

        if update_fields:
            _partial_update_datasets(ds_ids, es_exp, update_fields.split(','))
        else:
            _reindex_datasets(ds_ids, es_exp)
Example #9
def run(sm_config, ds_id_str, sql_where, algorithm, use_lithops):
    db = DB()

    if sql_where:
        ds_ids = [
            id for (id, ) in db.select(
                f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]
    else:
        ds_ids = ds_id_str.split(',')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    if use_lithops:
        executor = Executor(sm_config['lithops'])

    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(
                f'[{i+1} / {len(ds_ids)}] Generating ion thumbnail for {ds_id}'
            )
            ds = Dataset.load(db, ds_id)
            if use_lithops:
                # noinspection PyUnboundLocalVariable
                generate_ion_thumbnail_lithops(executor,
                                               db,
                                               ds,
                                               algorithm=algorithm)
            else:
                generate_ion_thumbnail(db, ds, algorithm=algorithm)
        except Exception:
            logger.error(f'Failed on {ds_id}', exc_info=True)
Example #10
def test_save_ds_meta_ds_doesnt_exist(spark_context, create_test_db, drop_test_db, sm_config, ds_config):
    work_dir_man_mock = MagicMock(WorkDirManager)
    work_dir_man_mock.ds_coord_path = '/ds_path'
    work_dir_man_mock.txt_path = '/txt_path'

    SMConfig._config_dict = sm_config

    with patch('sm.engine.tests.util.SparkContext.textFile') as m:
        m.return_value = spark_context.parallelize([
            '0,1,1\n',
            '1,100,200\n'])

        dataset = Dataset(spark_context, 'ds_name', '', ds_config, work_dir_man_mock, DB(sm_config['db']))
        dataset.save_ds_meta()

    db = DB(sm_config['db'])
    ds_row = db.select_one('SELECT name, file_path, img_bounds, config from dataset')
    assert ds_row == ('ds_name', '/txt_path',
                      {u'x': {u'min': 1, u'max': 100}, u'y': {u'min': 1, u'max': 200}},
                      ds_config)

    coord_row = db.select_one('SELECT xs, ys from coordinates')
    assert coord_row == ([1, 100], [1, 200])

    db.close()
Example #11
    def __init__(self, sm_config, analysis_version, database):
        reports_path = Path(proj_root()) / 'tests/reports'
        timestamp = datetime.now().replace(microsecond=0).isoformat().replace(
            ':', '-')
        suffix = f'{database}-v{analysis_version}'

        self.sm_config = sm_config
        self.db = DB()

        self.ds_id = '2000-01-01_00h00m01s'
        self.ref_results_path = reports_path / f'spheroid-{suffix}.csv'
        self.output_results_path = reports_path / f'test-{suffix}-{timestamp}.csv'

        self.ds_name = 'sci_test_spheroid_untreated'
        self.ds_data_path = join(self.sm_config['fs']['spark_data_path'],
                                 self.ds_name)
        self.moldb = MOL_DBS[database]
        self.analysis_version = analysis_version
        self.input_path = join(proj_root(), 'tests/data/untreated')
        self.ds_config_path = join(self.input_path, 'config.json')
        self.metrics = [
            'chaos', 'spatial', 'spectral', 'mz_err_abs', 'mz_err_rel', 'msm',
            'fdr'
        ]

        self.comparison_df = None
Example #12
def _upload_moldbs_from_db(moldb_ids, storage, sm_storage):
    moldb_defs = []
    bucket, prefix = sm_storage['moldb']
    # Sort the moldbs because the centroids cache key is affected by their order
    for moldb_id in sorted(moldb_ids):
        key = f'{prefix}/{moldb_id}'
        try:
            storage.head_object(bucket, key)
            logger.debug(f'Found mol db at {key}')
            cobject = CloudObject(storage.backend, bucket, key)
        except StorageNoSuchKeyError:
            logger.info(f'Uploading {key}...')
            mols_query = DB().select(
                'SELECT DISTINCT formula FROM molecule WHERE moldb_id = %s',
                (moldb_id, ))
            mols = [mol for mol, in mols_query]
            cobject = save_cobj(storage, mols, bucket=bucket, key=key)
            logger.info(f'Uploading {key}...Done')
        (targeted, ) = DB().select_one(
            'SELECT targeted FROM molecular_db WHERE id = %s', (moldb_id, ))
        moldb_defs.append({
            'id': moldb_id,
            'cobj': cobject,
            'targeted': targeted
        })

    return moldb_defs
Example #13
def init_moldb():
    db = DB()
    moldb = create_test_molecular_db()
    db.insert(
        "INSERT INTO molecule (mol_id, mol_name, formula, moldb_id) VALUES (%s, %s, %s, %s)",
        rows=[('HMDB0001', 'molecule name', 'C12H24O', moldb.id)],
    )
    return moldb
Example #14
    def test_delete_ds(self, EsMock, fill_db):
        db = DB()
        manager = create_daemon_man(db=db, es=EsMock())

        ds_id = '2000-01-01'
        ds = create_ds(ds_id=ds_id)

        manager.delete(ds)

        EsMock.return_value.delete_ds.assert_has_calls([call(ds_id)])
        assert db.select_one('SELECT * FROM dataset WHERE id = %s', params=(ds_id,)) == []
Example #15
def reindex_all_results(conf):
    db = DB(conf['db'])
    es_exp = ESExporter(conf)

    es_exp.delete_index(name='sm')
    es_exp.create_index(name='sm')

    ds_db_pairs = db.select("select name, config -> 'database'::text -> 'name'::text from dataset")

    for ds_name, db_name in ds_db_pairs:
        es_exp.index_ds(db, ds_name, db_name)
Example #16
def reindex_all_results(conf):
    db = DB(conf['db'])
    es_exp = ESExporter(conf)

    es_exp.delete_index(name='sm')
    es_exp.create_index(name='sm')

    ds_db_pairs = db.select(
        "select name, config -> 'database'::text -> 'name'::text from dataset")

    for ds_name, db_name in ds_db_pairs:
        es_exp.index_ds(db, ds_name, db_name)
Example #17
def test_theor_peaks_generator_run_failed_iso_peaks(create_fill_test_db, spark_context, sm_config, ds_config):
    ds_config["isotope_generation"]["adducts"] = ["+Na"]
    theor_peaks_gen = TheorPeaksGenerator(spark_context, sm_config, ds_config)
    theor_peaks_gen.isocalc_wrapper.isotope_peaks = lambda *args: Centroids([], [])
    theor_peaks_gen.run()

    db = DB(sm_config['db'])
    rows = db.select('SELECT * FROM theor_peaks')

    assert len(rows) == 1

    db.close()
Example #18
def test_create_moldb_malformed_csv(file, fill_db):
    input_doc = moldb_input_doc(file_path=f's3://{BUCKET_NAME}/{file.value}')
    with patch_bottle_request(input_doc):

        resp = api.databases.create()

        assert resp['status'] == MALFORMED_CSV['status']
        assert resp['error']

        db = DB()
        (db_count, ) = db.select_one(MOLDB_COUNT_SEL)
        assert db_count == 0
Example #19
def test_delete_moldb(fill_db):
    input_doc = moldb_input_doc(
        file_path=f's3://{BUCKET_NAME}/{MoldbFiles.VALID.value}')
    moldb = create_test_molecular_db(**input_doc)
    with patch_bottle_request(req_doc={}):

        resp = api.databases.delete(moldb_id=moldb.id)

        assert resp['status'] == 'success'

        db = DB()
        (db_count, ) = db.select_one(MOLDB_COUNT_SEL)
        assert db_count == 0
Example #20
def save_additional_info_to_db(db_id, user_id, input_path):
    conf = SMConfig.get_conf()
    with ConnectionPool(conf['db']):
        db = DB()
        if db.select_one('SELECT * FROM molecular_db WHERE id = %s',
                         (db_id, )):
            print(f'Updating existing molecular database {db_id}')
            DB().alter(
                'UPDATE molecular_db SET user_id = %s, input_path = %s WHERE id = %s',
                (user_id, input_path, db_id),
            )
        else:
            print(f'Specified molecular database {db_id} does not exist.')
Example #21
def test_create_moldb_empty_values(fill_db):
    input_doc = moldb_input_doc(
        file_path=f's3://{BUCKET_NAME}/{MoldbFiles.EMPTY_VALUES.value}')
    with patch_bottle_request(input_doc):

        resp = api.databases.create()

        assert resp['status'] == BAD_DATA['status']
        assert resp['error'] and resp['details']

        db = DB()
        (db_count, ) = db.select_one(MOLDB_COUNT_SEL)
        assert db_count == 0
Example #22
    def __init__(self, sc, sm_config, ds_config):
        self.sc = sc
        self.sm_config = sm_config
        self.ds_config = ds_config

        self.theor_peaks_tmp_dir = join(sm_config['fs']['base_path'],
                                        'tmp_theor_peaks_gen')
        self.db = DB(sm_config['db'])

        self.adducts = self.ds_config['isotope_generation']['adducts']

        self.isocalc_wrapper = IsocalcWrapper(
            self.ds_config['isotope_generation'])
Example #23
def fill_db(test_db, sm_config, ds_config):
    upload_dt = '2000-01-01 00:00:00'
    ds_id = '2000-01-01'
    metadata = {"meta": "data"}
    db = DB(sm_config['db'])
    db.insert((
        'INSERT INTO dataset (id, name, input_path, upload_dt, metadata, config, status, '
        'is_public, mol_dbs, adducts) '
        'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'),
              rows=[(ds_id, 'ds_name', 'input_path', upload_dt,
                     json.dumps(metadata), json.dumps(ds_config),
                     DatasetStatus.FINISHED, True, ['HMDB-v4'],
                     ['+H', '+Na', '+K'])])
Example #24
def test_sm_daemons_annot_fails(
    MSMSearchMock,
    post_images_to_image_store_mock,
    test_db,
    es_dsl_search,
    clean_isotope_storage,
    reset_queues,
    metadata,
    ds_config,
    queue_pub,
    local_sm_config,
):
    moldb = init_moldb()

    def throw_exception_function(*args, **kwargs):
        raise Exception('Test exception')

    msm_algo_mock = MSMSearchMock()
    msm_algo_mock.search.side_effect = throw_exception_function

    image_ids = ['iso_image_1', None, None, None]
    post_images_to_image_store_mock.return_value = {
        0: image_ids,
        1: image_ids,
        2: image_ids
    }

    db = DB()
    es = ESExporter(db, local_sm_config)
    ds = create_test_ds(
        name=test_ds_name,
        input_path=input_dir_path,
        config={
            **ds_config, 'database_ids': [moldb.id]
        },
        status=DatasetStatus.QUEUED,
        es=es,
    )

    queue_pub.publish({
        'ds_id': ds.id,
        'ds_name': test_ds_name,
        'action': DaemonAction.ANNOTATE
    })

    run_daemons(db, es, local_sm_config)

    # dataset and job tables asserts
    row = db.select_one('SELECT status from dataset')
    assert len(row) == 1
    assert row[0] == 'FAILED'
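As an aside, unittest.mock also accepts an exception instance (or class) directly as side_effect, so the helper function in the test above could be replaced by a one-liner:

msm_algo_mock = MSMSearchMock()
msm_algo_mock.search.side_effect = Exception('Test exception')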
Example #25
def _import_molecules(moldb, moldb_df, targeted_threshold):
    logger.info(f'{moldb}: importing {len(moldb_df)} molecules')

    columns = ['moldb_id', 'mol_id', 'mol_name', 'formula']
    buffer = StringIO()
    moldb_df = moldb_df.assign(moldb_id=int(moldb.id))
    moldb_df[columns].to_csv(buffer, sep='\t', index=False, header=False)
    buffer.seek(0)
    DB().copy(buffer, sep='\t', table='molecule', columns=columns)
    logger.info(f'{moldb}: inserted {len(moldb_df)} molecules')

    targeted = moldb_df.formula.unique().shape[0] <= targeted_threshold
    DB().alter('UPDATE molecular_db SET targeted = %s WHERE id = %s',
               params=(targeted, moldb.id))
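The same buffer-based COPY pattern works for any DataFrame whose columns line up with a target table; a small sketch assuming a hypothetical my_table(a, b):

from io import StringIO
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
buffer = StringIO()
df[['a', 'b']].to_csv(buffer, sep='\t', index=False, header=False)
buffer.seek(0)
DB().copy(buffer, sep='\t', table='my_table', columns=['a', 'b'])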
Example #26
    def __init__(self, sm_config_path):
        self.sm_config_path = sm_config_path
        self.sm_config = SMConfig.get_conf()
        self.db = DB(self.sm_config['db'])

        self.ds_id = '2000-01-01-00_00_00'
        self.base_search_res_path = join(proj_root(), 'tests/reports',
                                         'spheroid_untreated_search_res.csv')
        self.ds_name = 'sci_test_spheroid_untreated'
        self.data_dir_path = join(self.sm_config['fs']['base_path'],
                                  self.ds_name)
        self.input_path = join(proj_root(), 'tests/data/untreated')
        self.ds_config_path = join(self.input_path, 'config.json')
        self.metrics = ['chaos', 'spatial', 'spectral']
Example #27
def migrate_optical_images(ds_id):
    output.print('Migrating optical images')

    with timeit():
        output.print('Transferring images and updating database...')
        db = DB()
        rows = db.select(SEL_OPTICAL_IMGS, params=(ds_id,))
        for opt_image_id, opt_image_url in rows:
            if not opt_image_url and opt_image_id:
                transfer_images(
                    ds_id,
                    'optical_images',
                    image_storage.OPTICAL,
                    [opt_image_id],
                )
                opt_image_url = image_storage.get_image_url(
                    image_storage.OPTICAL, ds_id, opt_image_id
                )
                db.alter(UPD_OPTICAL_IMGS, params=(opt_image_url, opt_image_id))

        opt_thumb_id, opt_thumb_url = db.select_one(SEL_OPT_THUMB, params=(ds_id,))
        if not opt_thumb_url and opt_thumb_id:
            transfer_images(
                ds_id,
                'optical_images',
                image_storage.OPTICAL,
                [opt_thumb_id],
            )
            opt_thumb_url = image_storage.get_image_url(image_storage.OPTICAL, ds_id, opt_thumb_id)
            db.alter(UPD_OPT_THUMB, params=(opt_thumb_url, ds_id))
Example #28
def fill_test_db(create_test_db, drop_test_db):
    db_config = dict(database='sm_test',
                     user='******',
                     host='localhost',
                     password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE dataset CASCADE')
        db.insert("INSERT INTO dataset VALUES (%s, %s, %s, %s, %s)",
                  [(1, 'ds_name', '/ds_path', json.dumps({}), json.dumps({}))])
        db.alter('TRUNCATE coordinates CASCADE')
    except:
        raise
    finally:
        db.close()
Example #29
def create_fill_sm_database(create_test_db, drop_test_db, sm_config):
    local('psql -h localhost -U sm sm_test < {}'.format(join(proj_dir_path, 'scripts/create_schema.sql')))

    db = DB(sm_config['db'])
    try:
        db.insert('INSERT INTO formula_db VALUES (%s, %s, %s)',
                  [(0, '2016-01-01', 'HMDB')])
        db.insert('INSERT INTO formula VALUES (%s, %s, %s, %s, %s)',
                  [(100, 0, '00001', 'compound_name', 'C12H24O')])
        db.insert('INSERT INTO agg_formula VALUES (%s, %s, %s, %s, %s)',
                  [(10007, 0, 'C12H24O', ['00001'], ['compound_name'])])
    except:
        raise
    finally:
        db.close()
Example #30
def test_sm_daemons_annot_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                post_images_to_annot_service_mock,
                                MolDBServiceWrapperMock,
                                sm_config, test_db, es_dsl_search,
                                clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)

    def throw_exception_function(*args):
        raise Exception('Test')
    get_compute_img_metrics_mock.return_value = throw_exception_function
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])
    es = ESExporter(db)
    annotate_daemon = None

    try:
        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        ds_config_str = open(ds_config_path).read()
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        ds = Dataset.load(db, ds_id)
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # dataset and job tables asserts
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
        row = db.select_one('SELECT status from job')
        assert row[0] == 'FAILED'
    finally:
        db.close()
        if annotate_daemon:
            annotate_daemon.stop()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
Example #31
def fill_test_db(create_test_db, drop_test_db):
    db_config = dict(database='sm_test', user='******', host='localhost', password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE dataset CASCADE')
        db.insert("INSERT INTO dataset VALUES (%s, %s, %s, %s, %s)",
                  [(1, 'ds_name', '/ds_path', json.dumps({}), json.dumps({}))])
        db.alter('TRUNCATE coordinates CASCADE')
    except:
        raise
    finally:
        db.close()
Example #32
def migrate_isotopic_images(ds_id):
    output.print('Migrating isotopic images')

    db = DB()
    image_ids = db.select_onecol(SEL_DS_IMG_IDS, params=(ds_id,))
    es_exporter = ESExporter(db, sm_config)
    if image_ids and not _es_docs_migrated(es_exporter._es, ds_id):

        with timeit():
            output.print('Transferring images...')
            output.print(len(image_ids))
            transfer_images(ds_id, 'iso_images', image_storage.ISO, image_ids)

        with timeit():
            output.print('Reindexing ES documents...')
            es_exporter.reindex_ds(ds_id)
Example #33
def migrate_ion_thumbnail(ds_id):
    output.print('Migrating ion thumbnail images')

    with timeit():
        output.print('Transferring images and updating database...')
        db = DB()
        ion_thumb_id, ion_thumbnail_url = db.select_one(SEL_ION_THUMB, params=(ds_id,))
        if not ion_thumbnail_url and ion_thumb_id:
            transfer_images(
                ds_id,
                'ion_thumbnails',
                image_storage.THUMB,
                [ion_thumb_id],
            )
            ion_thumb_url = image_storage.get_image_url(image_storage.THUMB, ds_id, ion_thumb_id)
            db.alter(UPD_ION_THUMB, params=(ion_thumb_url, ds_id))
Example #34
class SciTester(object):

    def __init__(self, db_config):
        self.db = DB(db_config)
        self.base_search_res_path = join(proj_root(), 'tests/reports', 'spheroid_12h_search_res.csv')
        self.metrics = ['chaos', 'spatial', 'spectral']

    def metr_dict_to_array(self, metr_d):
        return np.array([metr_d[m] for m in self.metrics])

    def read_base_search_res(self):
        with open(self.base_search_res_path) as f:
            rows = map(lambda line: line.strip('\n').split('\t'), f.readlines()[1:])
            return {(r[0], r[1]): np.array(r[2:], dtype=float) for r in rows}

    def fetch_search_res(self):
        rows = self.db.select(SEARCH_RES_SELECT, ds_name, 'HMDB')
        return {(r[0], r[1]): self.metr_dict_to_array(r[2]) for r in rows}

    def run_sci_test(self):
        compare_search_results(self.read_base_search_res(), self.fetch_search_res())

    def save_sci_test_report(self):
        with open(self.base_search_res_path, 'w') as f:
            f.write('\t'.join(['sf', 'adduct'] + self.metrics) + '\n')
            for (sf, adduct), metrics in sorted(self.fetch_search_res().items()):
                f.write('\t'.join([sf, adduct] + metrics.astype(str).tolist()) + '\n')

        print('Successfully saved sample dataset search report')
Example #35
def create_test_db():
    db_config = dict(database='postgres', user='******', host='localhost')
    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.alter('CREATE DATABASE sm_test')
    db.close()

    local('psql -h localhost -U sm sm_test < {}'.format(join(proj_root(), 'scripts/create_schema.sql')))
Example #36
def create_fill_sm_database(create_test_db, drop_test_db, sm_config):
    local('psql -h localhost -U sm sm_test < {}'.format(join(proj_dir_path, 'scripts/create_schema.sql')))

    db = DB(sm_config['db'])
    try:
        db.insert('INSERT INTO formula_db VALUES (%s, %s, %s)',
                  [(0, '2016-01-01', 'HMDB')])
        db.insert('INSERT INTO formula VALUES (%s, %s, %s, %s, %s)',
                  [(100, 0, '00001', 'compound_name', 'C12H24O')])
        db.insert('INSERT INTO agg_formula VALUES (%s, %s, %s, %s, %s)',
                  [(10007, 0, 'C12H24O', ['00001'], ['compound_name'])])
    except:
        raise
    finally:
        db.close()
Example #37
    def __init__(self, sc, sm_config, ds_config):  # TODO: replace sm_config with db
        self.sc = sc
        self.sm_config = sm_config
        self.ds_config = ds_config

        self.theor_peaks_tmp_dir = join(sm_config['fs']['base_path'], 'tmp_theor_peaks_gen')
        self.db = DB(sm_config['db'])

        self.adducts = self.ds_config['isotope_generation']['adducts']

        self.isocalc_wrapper = IsocalcWrapper(self.ds_config['isotope_generation'])
Example #38
    def __init__(self, sm_config_path):
        self.sm_config_path = sm_config_path
        self.sm_config = SMConfig.get_conf()
        self.db = DB(self.sm_config['db'])

        self.ds_id = '2000-01-01-00_00_00'
        self.base_search_res_path = join(proj_root(), 'tests/reports', 'spheroid_untreated_search_res.csv')
        self.ds_name = 'sci_test_spheroid_untreated'
        self.data_dir_path = join(self.sm_config['fs']['base_path'], self.ds_name)
        self.input_path = join(proj_root(), 'tests/data/untreated')
        self.ds_config_path = join(self.input_path, 'config.json')
        self.metrics = ['chaos', 'spatial', 'spectral']
Example #39
def test_search_job_imzml_example_es_export_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                                  post_images_to_annot_service_mock,
                                                  MolDBServiceWrapperMock, MolDBServiceWrapperMock2,
                                                  sm_config, create_fill_sm_database, es_dsl_search,
                                                  clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock2)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])

    def throw_exception_function(*args):
        raise Exception('Test')

    try:
        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        ds_config_str = open(ds_config_path).read()
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        with patch('sm.engine.search_job.ESExporter.index_ds') as index_ds_mock:
            index_ds_mock.side_effect = throw_exception_function

            img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url'])
            job = SearchJob(img_store=img_store)
            ds = Dataset.load(db, ds_id)
            job.run(ds)
    except ESExportFailedError as e:
        assert e
        # dataset table asserts
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
    else:
        raise AssertionError('ESExportFailedError should be raised')
    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
Example #40
def test_search_job_imzml_example(get_compute_img_measures_mock, create_fill_sm_database, sm_config):
    get_compute_img_measures_mock.return_value = lambda *args: (0.9, 0.9, 0.9)

    SMConfig._config_dict = sm_config

    db = DB(sm_config['db'])
    try:
        job = SearchJob(None, 'imzml_example_ds')
        job.run(input_dir_path, ds_config_path, clean=True)

        # dataset meta asserts
        rows = db.select("SELECT name, file_path, img_bounds from dataset")
        img_bounds = {u'y': {u'max': 3, u'min': 1}, u'x': {u'max': 3, u'min': 1}}
        file_path = 'file://' + join(data_dir_path, 'ds.txt')
        assert len(rows) == 1
        assert rows[0] == (test_ds_name, file_path, img_bounds)

        # theoretical patterns asserts
        rows = db.select('SELECT db_id, sf_id, adduct, centr_mzs, centr_ints, prof_mzs, prof_ints '
                         'FROM theor_peaks '
                         'ORDER BY adduct')

        assert len(rows) == 3 + len(DECOY_ADDUCTS)
        for r in rows:
            assert r[3] and r[4]

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf_id, adduct, peaks_n, stats FROM iso_image_metrics '
                          'ORDER BY sf_id, adduct'))

        assert rows
        assert rows[0]
        assert tuple(rows[0][:2]) == (0, 10007)
        assert set(rows[0][4].keys()) == {'chaos', 'spatial', 'spectral'}

        # image asserts
        rows = db.select(('SELECT db_id, sf_id, adduct, peak, intensities, min_int, max_int '
                          'FROM iso_image '
                          'ORDER BY sf_id, adduct'))
        assert rows

        max_int = 0.0
        for r in rows:
            max_int = max(max_int, r[-1])
            assert tuple(r[:2]) == (0, 10007)
        assert max_int

    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
Example #41
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----
    no_clean : bool
        Don't delete interim data files
    """
    def __init__(self, img_store=None, no_clean=False):
        self.no_clean = no_clean
        self._img_store = img_store

        self._job_id = None
        self._sc = None
        self._db = None
        self._ds = None
        self._ds_reader = None
        self._status_queue = None
        self._fdr = None
        self._wd_manager = None
        self._es = None

        self._sm_config = SMConfig.get_conf()

        logger.debug('Using SM config:\n%s', pformat(self._sm_config))

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self._sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self._sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key", self._sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key", self._sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            sconf.set("spark.hadoop.fs.s3a.endpoint", "s3.{}.amazonaws.com".format(self._sm_config['aws']['aws_region']))

        self._sc = SparkContext(master=self._sm_config['spark']['master'], conf=sconf, appName='SM engine')

    def _init_db(self):
        logger.info('Connecting to the DB')
        self._db = DB(self._sm_config['db'])

    def store_job_meta(self, mol_db_id):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        rows = [(mol_db_id, self._ds.id, 'STARTED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self._job_id = self._db.insert_return(JOB_INS, rows=rows)[0]

    def _run_annotation_job(self, mol_db):
        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info("Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                        self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            target_adducts = self._ds.config['isotope_generation']['adducts']
            self._fdr = FDR(job_id=self._job_id,
                            decoy_sample_size=20,
                            target_adducts=target_adducts,
                            db=self._db)

            isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
            centroids_gen = IonCentroidsGenerator(sc=self._sc, moldb_name=mol_db.name, isocalc=isocalc)
            polarity = self._ds.config['isotope_generation']['charge']['polarity']
            all_adducts = list(set(self._sm_config['defaults']['adducts'][polarity]) | set(DECOY_ADDUCTS))
            centroids_gen.generate_if_not_exist(isocalc=isocalc,
                                                sfs=mol_db.sfs,
                                                adducts=all_adducts)
            target_ions = centroids_gen.ions(target_adducts)
            self._fdr.decoy_adducts_selection(target_ions)

            search_alg = MSMBasicSearch(sc=self._sc, ds=self._ds, ds_reader=self._ds_reader,
                                        mol_db=mol_db, centr_gen=centroids_gen,
                                        fdr=self._fdr, ds_config=self._ds.config)
            ion_metrics_df, ion_iso_images = search_alg.search()

            search_results = SearchResults(mol_db.id, self._job_id, search_alg.metrics.keys())
            mask = self._ds_reader.get_2d_sample_area_mask()
            img_store_type = self._ds.get_ion_img_storage_type(self._db)
            search_results.store(ion_metrics_df, ion_iso_images, mask, self._db, self._img_store, img_store_type)
        except Exception as e:
            self._db.alter(JOB_UPD, params=('FAILED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))
            msg = 'Job failed (ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise JobFailedError(msg) from e
        else:
            self._export_search_results_to_es(mol_db, isocalc)

    def _export_search_results_to_es(self, mol_db, isocalc):
        try:
            self._es.index_ds(self._ds.id, mol_db, isocalc)
        except Exception as e:
            self._db.alter(JOB_UPD, params=('FAILED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))
            msg = 'Export to ES failed (ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise ESExportFailedError(msg) from e
        else:
            self._db.alter(JOB_UPD, params=('FINISHED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))

    def _remove_annotation_job(self, mol_db):
        logger.info("Removing job results ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                    self._ds.id, self._ds.name, mol_db.name, mol_db.version)
        self._db.alter('DELETE FROM job WHERE ds_id = %s and db_id = %s', params=(self._ds.id, mol_db.id))
        self._es.delete_ds(self._ds.id, mol_db)

    def _moldb_ids(self):
        moldb_service = MolDBServiceWrapper(self._sm_config['services']['mol_db'])
        completed_moldb_ids = {moldb_service.find_db_by_id(db_id)['id']
                               for (_, db_id) in self._db.select(JOB_ID_MOLDB_ID_SEL, params=(self._ds.id,))}
        new_moldb_ids = {moldb_service.find_db_by_name_version(moldb_name)[0]['id']
                         for moldb_name in self._ds.config['databases']}
        return completed_moldb_ids, new_moldb_ids

    def _save_data_from_raw_ms_file(self):
        ms_file_type_config = SMConfig.get_ms_file_handler(self._wd_manager.local_dir.ms_file_path)
        acq_geometry_factory_module = ms_file_type_config['acq_geometry_factory']
        acq_geometry_factory = getattr(import_module(acq_geometry_factory_module['path']),
                                       acq_geometry_factory_module['name'])

        acq_geometry = acq_geometry_factory(self._wd_manager.local_dir.ms_file_path).create()
        self._ds.save_acq_geometry(self._db, acq_geometry)

        self._ds.save_ion_img_storage_type(self._db, ms_file_type_config['img_storage_type'])

    def run(self, ds):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Conversion of input mass spec files to plain text format (one line per spectrum)
            * Generation and saving of theoretical peaks to the database for all formulas in the molecule database
            * Molecule search. The most compute-intensive part; Spark is used to run it in a distributed manner.
            * Saving results (isotope images and their quality metrics for each putative molecule) to the database

        Args
        ----
            ds : sm.engine.dataset_manager.Dataset
        """
        try:
            start = time.time()

            self._init_db()
            self._es = ESExporter(self._db)
            self._ds = ds

            if self._sm_config['rabbitmq']:
                self._status_queue = QueuePublisher(config=self._sm_config['rabbitmq'],
                                                    qdesc=SM_DS_STATUS,
                                                    logger=logger)
            else:
                self._status_queue = None
            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            self._wd_manager = WorkDirManager(ds.id)
            self._configure_spark()

            if not self.no_clean:
                self._wd_manager.clean()

            self._ds_reader = DatasetReader(self._ds.input_path, self._sc, self._wd_manager)
            self._ds_reader.copy_convert_input_data()

            self._save_data_from_raw_ms_file()
            self._img_store.storage_type = self._ds.get_ion_img_storage_type(self._db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            logger.info('Dataset config:\n%s', pformat(self._ds.config))

            completed_moldb_ids, new_moldb_ids = self._moldb_ids()
            for moldb_id in completed_moldb_ids.symmetric_difference(new_moldb_ids):  # ignore ids present in both sets
                mol_db = MolecularDB(id=moldb_id, db=self._db,
                                     iso_gen_config=self._ds.config['isotope_generation'])
                if moldb_id not in new_moldb_ids:
                    self._remove_annotation_job(mol_db)
                elif moldb_id not in completed_moldb_ids:
                    self._run_annotation_job(mol_db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FINISHED)

            logger.info("All done!")
            time_spent = time.time() - start
            logger.info('Time spent: %d mins %d secs', *divmod(int(round(time_spent)), 60))
        except Exception as e:
            if self._ds:
                ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FAILED)
            logger.error(e, exc_info=True)
            raise
        finally:
            if self._sc:
                self._sc.stop()
            if self._db:
                self._db.close()
            if self._wd_manager and not self.no_clean:
                self._wd_manager.clean()
            logger.info('*' * 150)
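A minimal driver sketch pieced together from the test code in the other examples; the config path and dataset id are placeholders:

SMConfig.set_path('conf/config.json')
sm_config = SMConfig.get_conf()

db = DB(sm_config['db'])
img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url'])

job = SearchJob(img_store=img_store)
ds = Dataset.load(db, '2000-01-01_00h00m')
job.run(ds)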
Example #42
class SciTester(object):

    def __init__(self, sm_config_path):
        self.sm_config_path = sm_config_path
        self.sm_config = SMConfig.get_conf()
        self.db = DB(self.sm_config['db'])

        self.ds_id = '2000-01-01-00_00_00'
        self.base_search_res_path = join(proj_root(), 'tests/reports', 'spheroid_untreated_search_res.csv')
        self.ds_name = 'sci_test_spheroid_untreated'
        self.data_dir_path = join(self.sm_config['fs']['base_path'], self.ds_name)
        self.input_path = join(proj_root(), 'tests/data/untreated')
        self.ds_config_path = join(self.input_path, 'config.json')
        self.metrics = ['chaos', 'spatial', 'spectral']

    def metr_dict_to_array(self, metr_d):
        metric_array = np.array([metr_d[m] for m in self.metrics])
        return np.hstack([metric_array, metric_array.prod()])

    def read_base_search_res(self):
        def prep_metric_arrays(a):
            return np.array(a, dtype=float)

        with open(self.base_search_res_path) as f:
            rows = map(lambda line: line.strip('\n').split('\t'), f.readlines()[1:])
            return {(r[0], r[1]):  prep_metric_arrays(r[2:]) for r in rows}

    def fetch_search_res(self):
        mol_db_service = MolDBServiceWrapper(self.sm_config['services']['mol_db'])
        mol_db_id = mol_db_service.find_db_by_name_version('HMDB-v2.5')[0]['id']
        rows = self.db.select(SEARCH_RES_SELECT, params=(mol_db_id, self.ds_name))
        return {(r[0], r[1]): self.metr_dict_to_array(r[2]) for r in rows}

    def save_sci_test_report(self):
        with open(self.base_search_res_path, 'w') as f:
            f.write('\t'.join(['sf', 'adduct'] + self.metrics) + '\n')
            for (sf, adduct), metrics in sorted(self.fetch_search_res().items()):
                f.write('\t'.join([sf, adduct] + metrics.astype(str).tolist()) + '\n')

        print('Successfully saved sample dataset search report')

    @staticmethod
    def print_metric_hist(metric_arr, bins=10):
        metric_freq, metric_interv = np.histogram(metric_arr, bins=np.linspace(-1, 1, 21))
        metric_interv = [round(x, 2) for x in metric_interv]
        pprint(list(zip(zip(metric_interv[:-1], metric_interv[1:]), metric_freq)))

    def report_metric_differences(self, metrics_array):
        metrics_array = np.array(metrics_array)
        print("\nCHAOS HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 0])
        print("\nIMG_CORR HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 1])
        print("\nPAT_MATCH HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 2])
        print("\nMSM HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 3])

    def _missed_formulas(self, old, new):
        missed_sf_adduct = set(old.keys()) - set(new.keys())
        print('MISSED FORMULAS: {:.1f}%'.format(len(missed_sf_adduct) / len(old) * 100))
        if missed_sf_adduct:
            missed_sf_base_metrics = np.array([old[k] for k in missed_sf_adduct])
            self.report_metric_differences(missed_sf_base_metrics)
        return bool(missed_sf_adduct)

    def _false_discovery(self, old, new):
        new_sf_adduct = set(new.keys()) - set(old.keys())
        print('\nFALSE DISCOVERY: {:.1f}%'.format(len(new_sf_adduct) / len(old) * 100))

        if new_sf_adduct:
            for sf_adduct in new_sf_adduct:
                metrics = new[sf_adduct]
                print('{} metrics = {}'.format(sf_adduct, metrics))
        return bool(new_sf_adduct)

    def _metrics_diff(self, old, new):
        print('\nDIFFERENCE IN METRICS:')
        metric_diffs = []
        for b_sf_add, b_metr in old.items():
            if b_sf_add in new.keys():
                metr = new[b_sf_add]
                diff = b_metr - metr
                if np.any(np.abs(diff) > 1e-6):
                    metric_diffs.append(diff)
                    print('{} metrics diff = {}'.format(b_sf_add, diff))

        if metric_diffs:
            self.report_metric_differences(metric_diffs)
        return bool(metric_diffs)

    def search_results_are_different(self):
        old_search_res = self.read_base_search_res()
        search_res = self.fetch_search_res()
        return (self._missed_formulas(old_search_res, search_res) or
                self._false_discovery(old_search_res, search_res) or
                self._metrics_diff(old_search_res, search_res))

    def _create_img_store_mock(self):

        class ImageStoreMock(object):
            def post_image(self, *args):
                return None

            def delete_image_by_id(self, *args):
                return None

        return ImageStoreMock()

    def run_search(self, mock_img_store=False):
        if mock_img_store:
            img_store = self._create_img_store_mock()
        else:
            img_store = ImageStoreServiceWrapper(self.sm_config['services']['img_service_url'])
        ds_man = SMDaemonDatasetManager(db=self.db, es=ESExporter(self.db),
                                        img_store=img_store, mode='local')

        ds = create_ds_from_files(self.ds_id, self.ds_name, self.input_path)
        from sm.engine import SearchJob
        ds_man.add(ds, search_job_factory=SearchJob, del_first=True)

    def clear_data_dirs(self):
        with warn_only():
            local('rm -rf {}'.format(self.data_dir_path))
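A hypothetical driver sketch for the sci-test class above; the config path is a placeholder and only methods defined in the class are used:

SMConfig.set_path('conf/config.json')
sci_tester = SciTester('conf/config.json')
sci_tester.run_search(mock_img_store=True)
if sci_tester.search_results_are_different():
    print('Search results differ from the saved base report')
else:
    print('Search results match the saved base report')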
Example #43
    def __init__(self, db_config):
        self.db = DB(db_config)
        self.base_search_res_path = join(proj_root(), 'tests/reports', 'spheroid_12h_search_res.csv')
        self.metrics = ['chaos', 'spatial', 'spectral']
Example #44
              "JOIN agg_formula f ON f.id = m.sf_id AND sf_db.id = f.db_id "
              "JOIN job j ON j.id = m.job_id "
              "JOIN dataset ds ON ds.id = j.ds_id "
              "JOIN theor_peaks tp ON tp.db_id = sf_db.id AND tp.sf_id = m.sf_id AND tp.adduct = m.adduct "
              "WHERE sf_db.name = %s AND ds.name = %s "
              "AND ROUND(sigma::numeric, 6) = %s AND charge = %s AND pts_per_mz = %s")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Exporting search results into a csv file')
    parser.add_argument('ds_name', type=str, help='Dataset name')
    parser.add_argument('csv_path', type=str, help='Path for the csv file')
    parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path')
    parser.set_defaults(sm_config_path=path.join(proj_root(), 'conf/config.json'))
    args = parser.parse_args()

    SMConfig.set_path(args.sm_config_path)
    db = DB(SMConfig.get_conf()['db'])

    ds_config = db.select_one(DS_CONFIG_SEL, args.ds_name)[0]
    isotope_gen_config = ds_config['isotope_generation']
    charge = '{}{}'.format(isotope_gen_config['charge']['polarity'], isotope_gen_config['charge']['n_charges'])
    export_rs = db.select(EXPORT_SEL, ds_config['database']['name'], args.ds_name,
                          isotope_gen_config['isocalc_sigma'], charge, isotope_gen_config['isocalc_pts_per_mz'])

    header = ','.join(['formula_db', 'ds_name', 'sf', 'adduct', 'chaos', 'img_corr', 'pat_match',
                       'isocalc_sigma', 'isocalc_charge', 'isocalc_pts_per_mz', 'first_peak_mz']) + '\n'
    with open(args.csv_path, 'w') as f:
        f.write(header)
        f.writelines([','.join(map(str, row)) + '\n' for row in export_rs])
    logger.info('Exported all search results for "%s" dataset into "%s" file', args.ds_name, args.csv_path)
Example #45
class TheorPeaksGenerator(object):
    """ Generator of theoretical isotope peaks for all molecules in a database.

    Args
    ----------
    sc : pyspark.SparkContext
    sm_config : dict
        SM engine config
    ds_config : dict
        Dataset config
    """
    def __init__(self, sc, sm_config, ds_config):  # TODO: replace sm_config with db
        self.sc = sc
        self.sm_config = sm_config
        self.ds_config = ds_config

        self.theor_peaks_tmp_dir = join(sm_config['fs']['base_path'], 'tmp_theor_peaks_gen')
        self.db = DB(sm_config['db'])

        self.adducts = self.ds_config['isotope_generation']['adducts']

        self.isocalc_wrapper = IsocalcWrapper(self.ds_config['isotope_generation'])

    @staticmethod
    def _sf_elements(sf):
        return [seg.element().name() for seg in parseSumFormula(sf).get_segments()]

    @classmethod
    def _valid_sf_adduct(cls, sf, adduct):
        if sf is None or adduct is None or sf == 'None' or adduct == 'None':
            logger.warning('Invalid sum formula or adduct: sf=%s, adduct=%s', sf, adduct)
            return False

        if '-' in adduct and adduct.strip('-') not in cls._sf_elements(sf):
            logger.info('No negative adduct element in the sum formula: sf=%s, adduct=%s', sf, adduct)
            return False

        return True

    def run(self):
        """ Starts peaks generation. Checks all formula peaks saved in the database and
        generates peaks only for new ones"""
        logger.info('Running theoretical peaks generation')

        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
        formula_list = self.apply_database_filters(self.db.select(AGG_FORMULA_SEL, db_id))

        stored_sf_adduct = self.db.select(SF_ADDUCT_SEL, db_id,
                                          self.isocalc_wrapper.sigma,
                                          self.isocalc_wrapper.charge,
                                          self.isocalc_wrapper.pts_per_mz)

        sf_adduct_cand = self.find_sf_adduct_cand(formula_list, set(stored_sf_adduct))
        logger.info('%d saved (sf, adduct)s, %s not saved (sf, adduct)s', len(stored_sf_adduct), len(sf_adduct_cand))

        if sf_adduct_cand:
            self.generate_theor_peaks(sf_adduct_cand)

    def apply_database_filters(self, formula_list):
        """ Filters according to settings in dataset config

        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through

        Returns
        -------
        : list
            Filtered list of pairs (id, sum formula)
        """
        if 'organic' in map(lambda s: s.lower(), self.ds_config['database'].get('filters', [])):
            logger.info('Organic sum formula filter has been applied')
            return [(sf_id, sf) for (sf_id, sf) in formula_list if 'C' in self._sf_elements(sf)]
        return formula_list

    def find_sf_adduct_cand(self, formula_list, stored_sf_adduct):
        """
        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through
        stored_sf_adduct : set
            Set of (formula, adduct) pairs which have theoretical patterns saved in the database

        Returns
        -------
        : list
            List of (formula id, formula, adduct) triples which don't have theoretical patterns saved in the database
        """
        assert formula_list, 'Empty agg_formula table!'
        adducts = set(self.adducts) | set(DECOY_ADDUCTS)
        cand = [(id, sf, a) for (id, sf) in formula_list for a in adducts]
        return [(sf_id, sf, adduct) for (sf_id, sf, adduct) in cand if (sf, adduct) not in stored_sf_adduct]

    def generate_theor_peaks(self, sf_adduct_cand):
        """
        Args
        ----
        sf_adduct_cand : list
            List of (formula id, formula, adduct) triples which don't have theoretical patterns saved in the database

        Returns
        -------
        : list
            List of strings with formatted theoretical peaks data
        """
        logger.info('Generating missing peaks')
        formatted_iso_peaks = self.isocalc_wrapper.formatted_iso_peaks
        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
        n = 10000
        for i in range(0, len(sf_adduct_cand), n):
            sf_adduct_cand_rdd = self.sc.parallelize(sf_adduct_cand[i:i+n], numSlices=128)
            peak_lines = (sf_adduct_cand_rdd
                          .flatMap(lambda sf_adduct: formatted_iso_peaks(db_id, *sf_adduct))
                          .collect())
            self._import_theor_peaks_to_db(peak_lines)

    def _import_theor_peaks_to_db(self, peak_lines):
        logger.info('Saving new peaks to the DB')
        if not exists(self.theor_peaks_tmp_dir):
            makedirs(self.theor_peaks_tmp_dir)

        peak_lines_path = join(self.theor_peaks_tmp_dir, 'peak_lines.csv')
        with open(peak_lines_path, 'w') as f:
            f.write('\n'.join(peak_lines))

        with open(peak_lines_path) as peaks_file:
            self.db.copy(peaks_file, 'theor_peaks')
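Usage as it appears in the tests above (spark_context, sm_config and ds_config come from the engine's test fixtures):

theor_peaks_gen = TheorPeaksGenerator(spark_context, sm_config, ds_config)
theor_peaks_gen.run()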
Example #46
    def _init_db(self):
        logger.info('Connecting to the DB')
        self._db = DB(self._sm_config['db'])
Example #47
def test_search_job_imzml_example(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                  post_images_to_annot_service_mock, MolDBServiceWrapperMock, MolDBServiceWrapperMock2,
                                  sm_config, create_fill_sm_database, es_dsl_search, clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock2)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])

    try:
        ds_config_str = open(ds_config_path).read()
        upload_dt = datetime.now()
        ds_id = '2000-01-01_00h00m'
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url'])
        job = SearchJob(img_store=img_store)
        job._sm_config['rabbitmq'] = {}  # avoid talking to RabbitMQ during the test
        ds = Dataset.load(db, ds_id)
        job.run(ds)

        # dataset table asserts
        rows = db.select('SELECT id, name, input_path, upload_dt, status from dataset')
        input_path = join(dirname(__file__), 'data', test_ds_name)
        assert len(rows) == 1
        assert rows[0] == (ds_id, test_ds_name, input_path, upload_dt, DatasetStatus.FINISHED)

        # ms acquisition geometry asserts
        rows = db.select('SELECT acq_geometry from dataset')
        assert len(rows) == 1
        assert rows[0][0] == ds.get_acq_geometry(db)
        assert rows[0][0] == {
            ACQ_GEOMETRY_KEYS.LENGTH_UNIT: 'nm',
            ACQ_GEOMETRY_KEYS.AcqGridSection.section_name: {
                ACQ_GEOMETRY_KEYS.AcqGridSection.REGULAR_GRID: True,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_X : 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_Y : 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_X : 100,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_Y : 100
            },
            ACQ_GEOMETRY_KEYS.PixelSizeSection.section_name: {
                ACQ_GEOMETRY_KEYS.PixelSizeSection.REGULAR_SIZE: True,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_X : 100,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_Y : 100
            }
        }

        # job table asserts
        rows = db.select('SELECT db_id, ds_id, status, start, finish from job')
        assert len(rows) == 1
        db_id, ds_id, status, start, finish = rows[0]
        assert (db_id, ds_id, status) == (0, '2000-01-01_00h00m', 'FINISHED')
        assert start < finish

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf, adduct, stats, iso_image_ids '
                          'FROM iso_image_metrics '
                          'ORDER BY sf, adduct'))

        assert rows[0] == (0, 'C12H24O', '+K', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])
        assert rows[1] == (0, 'C12H24O', '+Na', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                 'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])

        time.sleep(1)  # Waiting for ES
        # ES asserts
        ds_docs = es_dsl_search.query('term', _type='dataset').execute().to_dict()['hits']['hits']
        assert 1 == len(ds_docs)
        ann_docs = es_dsl_search.query('term', _type='annotation').execute().to_dict()['hits']['hits']
        assert len(ann_docs) == len(rows)
        for doc in ann_docs:
            assert doc['_id'].startswith(ds_id)

    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
Example #48
    def fin():
        db_config = dict(database='postgres', user='******', host='localhost', password='******')
        db = DB(db_config, autocommit=True)
        db.alter('DROP DATABASE IF EXISTS sm_test')
        db.close()
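In pytest fixtures of this vintage, a fin() teardown like the one above is registered via request.addfinalizer. A minimal sketch; the fixture name matches the drop_test_db fixture referenced by the other tests, and the masked credentials are kept as-is:

import pytest

@pytest.fixture()
def drop_test_db(request):
    def fin():
        db_config = dict(database='postgres', user='******', host='localhost', password='******')
        db = DB(db_config, autocommit=True)
        db.alter('DROP DATABASE IF EXISTS sm_test')
        db.close()

    request.addfinalizer(fin)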