예제 #1
0
def del_diagnostics(ds_id: str, job_ids: Optional[List[int]] = None):
    """Delete a dataset's diagnostics, removing both their images and their DB rows.

    Args:
        ds_id: ID of the dataset whose ``dataset_diagnostic`` rows are deleted.
        job_ids: If given, only rows whose ``job_id`` is in this list are deleted.
            If ``None``, every diagnostic row of the dataset is deleted.
    """
    db = DB()

    # Build a single query; the job filter is appended only when requested.
    query = 'SELECT id, images FROM dataset_diagnostic dd WHERE dd.ds_id = %s'
    query_params = [ds_id]
    if job_ids is not None:
        query += ' AND dd.job_id = ANY(%s)'
        query_params.append(job_ids)

    rows = db.select_with_fields(query, query_params)
    if not rows:
        return

    # Remove the stored images first (the "images" column may be NULL)
    image_ids = []
    for row in rows:
        image_ids.extend(img['image_id'] for img in row['images'] or [])
    image_storage.delete_images(image_storage.DIAG, ds_id, image_ids)

    # Then remove the DB rows themselves
    row_ids = [row['id'] for row in rows]
    db.alter(
        'DELETE FROM dataset_diagnostic WHERE id = ANY(%s::uuid[])',
        (row_ids, ),
    )
예제 #2
0
def test_classify_ion_images_preds_saved(call_api_mock, image_storage_mock, fill_db):
    """Predictions returned by the mocked API end up in ``annotation.off_sample``."""
    predictions = [{'prob': 0.1, 'label': 'on'}, {'prob': 0.9, 'label': 'off'}]
    # Copy up front so the expectation is independent of the objects handed to the mock
    expected_rows = [{'off_sample': dict(pred)} for pred in predictions]
    call_api_mock.return_value = {'predictions': predictions}

    # Every image request is answered with the bytes of a blank 10x10 RGBA PNG
    png_buffer = io.BytesIO()
    Image.new('RGBA', (10, 10)).save(png_buffer, format='PNG')
    image_storage_mock.get_image.return_value = png_buffer.getvalue()

    db = DB()
    ds_id = '2000-01-01'
    dataset = Dataset.load(db, ds_id)

    classify_dataset_ion_images(db, dataset, defaultdict(str))

    select_off_sample = (
        'select off_sample '
        'from dataset d '
        'join job j on j.ds_id = d.id '
        'join annotation m on m.job_id = j.id '
        'where d.id = %s '
        'order by m.id '
    )
    stored_rows = db.select_with_fields(select_off_sample, params=(ds_id,))

    assert stored_rows == expected_rows
예제 #3
0
def test_create_moldb(fill_db, is_public):
    """Creating a molecular DB via the API stores its metadata and its molecules."""
    request_doc = moldb_input_doc(
        file_path=f's3://{BUCKET_NAME}/{MoldbFiles.VALID.value}', is_public=is_public
    )
    with patch_bottle_request(request_doc) as input_doc:

        resp = api.databases.create()

        assert resp['status'] == 'success'
        moldb_id = resp['data']['id']

        db = DB()

        # The stored metadata must match what was sent in the request
        moldb_row = db.select_one_with_fields(
            'SELECT id, name, version, group_id, is_public FROM molecular_db where id = %s',
            params=(moldb_id, ),
        )
        for field in ('name', 'version', 'group_id', 'is_public'):
            assert moldb_row[field] == input_doc[field]

        # Every imported molecule row must expose the expected columns
        molecule_rows = db.select_with_fields(
            'SELECT * FROM molecule WHERE moldb_id = %s',
            params=(moldb_id, ),
        )
        for molecule in molecule_rows:
            print(molecule)
            for field in ('mol_id', 'mol_name', 'formula', 'inchi'):
                assert field in molecule
예제 #4
0
def update_core_metabolome_database():
    """Rename the ``core_metabolome_v3`` molecular DB to ``CoreMetabolome`` and set its metadata.

    Does nothing to the table if no row named ``core_metabolome_v3`` exists. In
    either case it then looks up ``CoreMetabolome`` and logs whether it was found.
    """
    db = DB()
    legacy_rows = db.select_with_fields(
        "SELECT * FROM molecular_db WHERE name = 'core_metabolome_v3'")
    if legacy_rows:
        moldb = legacy_rows[0]

        logger.info(f'Updating molecular database: {moldb}')

        # New values, listed in the same order as the SET clause below
        new_values = {
            'name': 'CoreMetabolome',
            'version': 'v3',
            'full_name': 'Core Metabolome Database',
            'description': 'METASPACE database of core mammalian metabolites and lipids',
            'link': 'https://metaspace2020.eu',
            'citation': ttdoc(tttext('In preparation')),
            'group_id': None,
            'is_public': True,
        }
        for field, value in new_values.items():
            moldb[field] = value

        update_params = tuple(moldb[field] for field in new_values) + (moldb['id'], )
        db.alter(
            ("UPDATE molecular_db "
             "SET name = %s, version = %s, full_name = %s, description = %s,"
             "    link = %s, citation = %s, group_id = %s, is_public = %s "
             "WHERE id = %s;"),
            params=update_params,
        )

    renamed_rows = db.select_with_fields(
        "SELECT * FROM molecular_db WHERE name = 'CoreMetabolome'")
    if not renamed_rows:
        logger.error('Did not find database "CoreMetabolome"')
    else:
        logger.info(f'Updated database: {renamed_rows[0]}')
예제 #5
0
def add_diagnostics(diagnostics: List[DatasetDiagnostic]):
    """Upsert dataset diagnostics.

    Existing rows with the same (ds_id, job_id, type) as an incoming diagnostic
    are deleted, along with their stored images, before the new rows are inserted.
    An empty input is a no-op.
    """
    if not diagnostics:
        return

    # Postgres cannot enforce the schema of the JSON columns, and several consumers
    # (graphql, python client, etc.) rely on these structures, so check them here.
    for diag in diagnostics:
        assert 'ds_id' in diag
        assert 'type' in diag
        imgs = diag.get('images', [])
        assert all(img['key'] in DiagnosticImageKey for img in imgs)
        assert all(img['format'] in DiagnosticImageFormat for img in imgs)
        assert all(img['image_id'] in img['url'] for img in imgs)
        distinct_keys = {(img.get('key'), img.get('index')) for img in imgs}
        assert len(distinct_keys) == len(imgs), 'diagnostic image keys should be unique'

    db = DB()

    # Column-wise parameters for the UNNEST-based lookup of rows to be replaced
    ds_id_col = [diag['ds_id'] for diag in diagnostics]
    job_id_col = [diag.get('job_id') for diag in diagnostics]
    type_col = [diag['type'] for diag in diagnostics]
    existing = db.select_with_fields(
        """
        WITH new_diagnostic AS (
            SELECT UNNEST(%s::text[]) as ds_id, UNNEST(%s::int[]) as job_id,
            UNNEST(%s::text[]) as type
        )
        SELECT dd.ds_id, dd.id, dd.images
        FROM new_diagnostic nd
        JOIN dataset_diagnostic dd ON nd.ds_id = dd.ds_id
            AND (nd.job_id = dd.job_id OR (nd.job_id IS NULL AND dd.job_id is NULL))
            AND nd.type = dd.type
        """,
        [ds_id_col, job_id_col, type_col],
    )

    if existing:
        logger.debug(
            f'Deleting {len(existing)} existing diagnostics for dataset {existing[0]["ds_id"]}'
        )
        # Group the image IDs per dataset, then delete each group from storage
        stale_images = {}
        for old_row in existing:
            for img in old_row['images'] or []:
                stale_images.setdefault(old_row['ds_id'], []).append(img['image_id'])
        for ds_id, image_ids in stale_images.items():
            image_storage.delete_images(image_storage.DIAG, ds_id, image_ids)

        # Remove the superseded DB rows
        stale_row_ids = [old_row['id'] for old_row in existing]
        db.alter(
            'DELETE FROM dataset_diagnostic WHERE id = ANY(%s::uuid[])',
            (stale_row_ids, ),
        )

    logger.debug(
        f'Inserting {len(diagnostics)} diagnostics for dataset {diagnostics[0]["ds_id"]}'
    )
    new_rows = [
        (
            diag['ds_id'],
            diag.get('job_id'),
            diag['type'],
            datetime.now(),
            # "data" is stored as NULL when absent, otherwise serialized to JSON
            None if diag.get('data') is None else numpy_json_dumps(diag['data']),
            diag.get('error'),
            numpy_json_dumps(diag.get('images', [])),
        )
        for diag in diagnostics
    ]
    db.insert(
        'INSERT INTO dataset_diagnostic (ds_id, job_id, type, updated_dt, data, error, images) '
        'VALUES (%s, %s, %s, %s, %s, %s, %s)',
        new_rows,
    )
예제 #6
0
class SciTester:
    """Runs an annotation search on the ``sci_test_spheroid_untreated`` dataset and
    compares the resulting annotations and metrics against a stored reference CSV."""

    def __init__(self, sm_config, analysis_version, database):
        """Set up paths, DB access and the list of compared metrics.

        Args:
            sm_config: Engine config; ``['fs']['spark_data_path']`` is read here and
                ``['lithops']`` is read by ``run_search`` when lithops is used.
            analysis_version: Analysis version written into the dataset config.
            database: Key into ``MOL_DBS`` selecting the molecular database.
        """
        reports_path = Path(proj_root()) / 'tests/reports'
        # ':' is replaced so the timestamp can be used in a file name
        timestamp = datetime.now().replace(microsecond=0).isoformat().replace(
            ':', '-')
        suffix = f'{database}-v{analysis_version}'

        self.sm_config = sm_config
        self.db = DB()

        self.ds_id = '2000-01-01_00h00m01s'
        self.ref_results_path = reports_path / f'spheroid-{suffix}.csv'
        self.output_results_path = reports_path / f'test-{suffix}-{timestamp}.csv'

        self.ds_name = 'sci_test_spheroid_untreated'
        self.ds_data_path = join(self.sm_config['fs']['spark_data_path'],
                                 self.ds_name)
        self.moldb = MOL_DBS[database]
        self.analysis_version = analysis_version
        self.input_path = join(proj_root(), 'tests/data/untreated')
        self.ds_config_path = join(self.input_path, 'config.json')
        self.metrics = [
            'chaos', 'spatial', 'spectral', 'mz_err_abs', 'mz_err_rel', 'msm',
            'fdr'
        ]

        # Populated by make_comparison_df()
        self.comparison_df = None

    def fetch_search_res_df(self):
        """Load the dataset's annotations from the DB as a DataFrame.

        Each row holds formula, adduct, msm, fdr plus every key of the
        annotation's ``stats`` value expanded into its own column.
        """
        query = ("SELECT m.formula, m.adduct, m.msm, m.fdr, m.stats "
                 "FROM annotation m "
                 "JOIN job j ON j.id = m.job_id "
                 "WHERE j.ds_id = %s "
                 "ORDER BY formula, adduct ")

        rows = self.db.select_with_fields(query, params=(self.ds_id, ))
        return pd.DataFrame([{
            'formula': r['formula'],
            'adduct': r['adduct'],
            'msm': r['msm'],
            'fdr': r['fdr'],
            **r['stats'],
        } for r in rows])

    def save_reference_results(self):
        """Write the current search results to the reference CSV."""
        results_df = self.fetch_search_res_df()

        cols = ['formula', 'adduct', *self.metrics]
        results_df[cols].to_csv(self.ref_results_path, index=False)

        print(
            f'Successfully saved reference search results to {self.ref_results_path}'
        )

    def save_comparison_results(self):
        """Write the comparison DataFrame to the timestamped output CSV."""
        self.comparison_df.to_csv(self.output_results_path, index=False)

    @staticmethod
    def print_metric_hist(metric_vals):
        """Print a text histogram (one ``lo-hi: count`` line per bin) of ``metric_vals``."""
        if 0.2 < np.max(metric_vals) - np.min(metric_vals) <= 3.0:
            # For metrics in the range -1.0 to 1.0, aligned bins of 0.1 are easier to read
            min_edge = np.floor(np.min(metric_vals) * 10) / 10
            max_edge = np.ceil(np.max(metric_vals) * 10) / 10
            n_bins = int(np.round((max_edge - min_edge) * 10))
        else:
            # Otherwise use unaligned bins
            min_edge = np.min(metric_vals)
            max_edge = np.max(metric_vals)
            n_bins = 10
        bins = np.linspace(min_edge, max_edge, n_bins + 1)
        metric_freq, metric_interv = np.histogram(metric_vals, bins=bins)

        for lo, hi, freq in zip(metric_interv[:-1], metric_interv[1:],
                                metric_freq):
            print(f'{lo:f}-{hi:f}: {freq}')

    @staticmethod
    def _percentage(part, total):
        """Return ``part`` as a percentage of ``total``; 0.0 when ``total`` is zero."""
        return part * 100 / total if total else 0.0

    def print_differences(self):
        """Print missed/unexpected annotation counts and per-metric difference histograms.

        Requires ``make_comparison_df`` to have populated ``self.comparison_df``.
        """
        df = self.comparison_df
        missing_df = df[df.matching == 'ref_only']
        unexpected_df = df[df.matching == 'new_only']
        common_df = df[df.matching == '']
        # .sum() counts the True entries of the mask. (.count() would count every
        # non-NA row, making both denominators equal to the total row count.)
        n_ref = int(df.matching.isin(['ref_only', '']).sum())
        n_new = int(df.matching.isin(['new_only', '']).sum())

        print(
            f'MISSED FORMULAS: {len(missing_df)} '
            f'({self._percentage(len(missing_df), n_ref):.1f}%)'
        )
        print(
            f'FALSE DISCOVERY: {len(unexpected_df)} '
            f'({self._percentage(len(unexpected_df), n_new):.1f}%)'
        )

        differing_metrics = [
            metric for metric in self.metrics
            if common_df[f'{metric}_differs'].any()
        ]
        if differing_metrics:
            for metric in differing_metrics:
                print(f'{metric}_new - {metric}_ref histogram: ')
                self.print_metric_hist(common_df[f'{metric}_new'] -
                                       common_df[f'{metric}_ref'])
                print()
        else:
            print('All metrics equal in common annotations')

    def fdr_differs(self, fdr_ref, fdr_new):
        """Return True if two FDR values differ by more than the allowed tolerance."""
        if self.analysis_version == 1:
            # FDRs are quantized - allow them to jump up/down one level
            levels = [0.0501, 0.1001, 0.2001, 0.5001]
            ref_level = next(
                (i for i, level in enumerate(levels) if fdr_ref < level),
                len(levels))
            new_level = next(
                (i for i, level in enumerate(levels) if fdr_new < level),
                len(levels))
            return abs(ref_level - new_level) > 1
        else:
            # Allow +/- 10% relative difference OR +/- 5% FDR absolute difference to compensate for
            # possible differences if the decoys are sampled differently.
            return not np.isclose(fdr_ref, fdr_new, rtol=0.1, atol=0.05)

    def make_comparison_df(self):
        """Outer-join reference and new results and store them in ``self.comparison_df``.

        The ``matching`` column is ``'ref_only'``, ``'new_only'`` or ``''`` (present in
        both). For each metric a ``<metric>_differs`` boolean column is added.
        """
        ref_results = pd.read_csv(self.ref_results_path)
        new_results = self.fetch_search_res_df()

        df = ref_results.merge(
            new_results,
            on=['formula', 'adduct'],
            how='outer',
            suffixes=('_ref', '_new'),
            indicator='matching',
        )
        df['matching'] = df.matching.cat.rename_categories({
            'left_only': 'ref_only',
            'right_only': 'new_only',
            'both': ''
        })

        # Interleave columns for easy side-by-side comparison
        cols = ['formula', 'adduct', 'matching']
        for col in self.metrics:
            cols.append(f'{col}_ref')
            cols.append(f'{col}_new')
        # Explicit copy: columns are added below, which should not target a slice
        df = df[cols].copy()

        # Add "differs" fields indicating whether the values have changed enough to be considered
        # different from the originals.
        for col in self.metrics:
            if col == 'fdr':
                df['fdr_differs'] = [
                    self.fdr_differs(row.fdr_ref, row.fdr_new)
                    for row in df[['fdr_ref', 'fdr_new']].itertuples()
                ]
            else:
                df[f'{col}_differs'] = ~np.isclose(df[f'{col}_ref'],
                                                   df[f'{col}_new'])

        self.comparison_df = df

    def search_results_are_different(self):
        """Return a truthy value if any annotation is unmatched or any metric differs."""
        annotations_mismatch = (self.comparison_df.matching != '').any()
        metrics_differ = any(self.comparison_df[f'{metric}_differs'].any()
                             for metric in self.metrics)
        return annotations_mismatch or metrics_differ

    @classmethod
    def _patch_image_storage(cls):
        """Replace ``search_results.ImageStorage`` with a mock whose ``post_image`` does nothing."""
        class ImageStorageMock:
            ISO = image_storage.ISO

            def __init__(self, *args, **kwargs):
                pass

            def post_image(self, *args, **kwargs):
                pass

        from sm.engine.annotation_spark import search_results

        search_results.ImageStorage = ImageStorageMock

    def run_search(self, store_images=False, use_lithops=False):
        """Run the annotation job for the test dataset, then build the comparison DataFrame.

        Args:
            store_images: When False, image storage is patched out before the search.
            use_lithops: When True, run ``ServerAnnotationJob`` through a lithops
                ``Executor``; otherwise run ``AnnotationJob``.
        """
        if not store_images:
            self._patch_image_storage()

        moldb_id = molecular_db.find_by_name_version(self.moldb['name'],
                                                     self.moldb['version']).id

        os.environ['PYSPARK_PYTHON'] = sys.executable

        ds = create_ds_from_files(self.ds_id, self.ds_name, self.input_path)
        ds.config['analysis_version'] = self.analysis_version
        ds.config['fdr'][
            'scoring_model'] = 'v3_default' if self.analysis_version > 1 else None
        ds.config['database_ids'] = [moldb_id]

        # Remove jobs left over from earlier runs of this dataset before saving it
        self.db.alter('DELETE FROM job WHERE ds_id=%s', params=(ds.id, ))
        ds.save(self.db, allow_insert=True)
        perf = NullProfiler()
        if use_lithops:
            # Override the runtime to force it to run without docker.
            lithops_executor.RUNTIME_CF_VPC = 'python'
            lithops_executor.RUNTIME_CE = 'python'

            executor = Executor(self.sm_config['lithops'], perf)
            job = ServerAnnotationJob(
                executor,
                ds,
                perf,
                self.sm_config,
                store_images=store_images,
            )
            job.run(debug_validate=True)
        else:
            AnnotationJob(ds, perf).run()

        self.make_comparison_df()

    def clear_data_dirs(self):
        """Remove the dataset's data directory if it exists."""
        path = Path(self.ds_data_path)
        if path.exists():
            # NOTE(review): Path.rmdir() only removes an EMPTY directory and raises
            # OSError otherwise - confirm whether a recursive delete was intended.
            path.rmdir()
예제 #7
0
def run_coloc_jobs(
    sm_config, ds_id_str, sql_where, fix_missing, fix_corrupt, skip_existing, use_lithops
):
    """Run colocalization jobs for a set of datasets chosen by exactly one data source.

    Args:
        sm_config: Engine config; ``['lithops']`` is read when ``use_lithops`` is set.
        ds_id_str: Comma-separated dataset IDs.
        sql_where: SQL condition interpolated into a ``SELECT ... FROM dataset WHERE`` query.
        fix_missing: Select datasets returned by ``MISSING_COLOC_JOBS_SEL``.
        fix_corrupt: Select datasets returned by ``CORRUPT_COLOC_JOBS_SEL``.
        skip_existing: When True, jobs are run with ``reprocess=False``.
        use_lithops: Run jobs through a lithops ``Executor`` instead of locally.

    A failure on one dataset is logged and does not stop the remaining datasets.
    """
    n_sources = sum(
        1 for data_source in (ds_id_str, sql_where, fix_missing, fix_corrupt) if data_source
    )
    assert (
        n_sources == 1
    ), "Exactly one data source (ds_id, sql_where, fix_missing, fix_corrupt) must be specified"
    assert not (ds_id_str and sql_where)

    db = DB()

    if ds_id_str:
        ds_ids = ds_id_str.split(',')
    elif sql_where:
        # NOTE: sql_where is interpolated verbatim, so it must come from a trusted caller
        matching = db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ds_ids = [found_id for (found_id,) in matching]
    else:
        mol_dbs = [
            (doc['id'], doc['name'])
            for doc in db.select_with_fields('SELECT id, name FROM molecular_db m')
        ]
        mol_db_ids, mol_db_names = map(list, zip(*mol_dbs))
        fdrs = [0.05, 0.1, 0.2, 0.5]
        algorithms = ['median_thresholded_cosine', 'cosine']

        # Both checks take the same parameters; only the query and the log text differ
        if fix_missing:
            logger.info('Checking for missing colocalization jobs...')
            check_sql, problem = MISSING_COLOC_JOBS_SEL, 'missing'
        else:
            logger.info(
                'Checking all colocalization jobs. '
                'This is super slow: ~5 minutes per 1000 datasets...'
            )
            check_sql, problem = CORRUPT_COLOC_JOBS_SEL, 'corrupt'

        results = db.select(
            check_sql, [list(mol_db_ids), list(mol_db_names), fdrs, algorithms]
        )
        ds_ids = [found_id for (found_id,) in results]
        logger.info(f'Found {len(ds_ids)} {problem} colocalization sets')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    if use_lithops:
        executor = Executor(sm_config['lithops'])

    total = len(ds_ids)
    for position, ds_id in enumerate(ds_ids, start=1):
        try:
            logger.info(f'Running colocalization on {position} out of {total}')
            ds = Dataset.load(db, ds_id)
            coloc = Colocalization(db)
            if not use_lithops:
                coloc.run_coloc_job(ds, reprocess=not skip_existing)
            else:
                # noinspection PyUnboundLocalVariable
                coloc.run_coloc_job_lithops(executor, ds, reprocess=not skip_existing)
        except Exception:
            logger.error(f'Failed to run colocalization on {ds_id}', exc_info=True)