def del_diagnostics(ds_id: str, job_ids: Optional[List[int]] = None):
    """Delete a dataset's diagnostics rows together with the images they reference.

    Args:
        ds_id: dataset whose diagnostics are removed.
        job_ids: when given, only diagnostics whose job_id is in this list are removed;
            when None, every diagnostic of the dataset is removed.
    """
    db = DB()

    # Build one query and extend it only when a job filter was requested
    query = 'SELECT id, images FROM dataset_diagnostic dd WHERE dd.ds_id = %s'
    params = [ds_id]
    if job_ids is not None:
        query += ' AND dd.job_id = ANY(%s)'
        params.append(job_ids)

    matching_rows = db.select_with_fields(query, params)
    if not matching_rows:
        return

    # Remove the stored images first; a row's "images" column may be NULL
    image_ids = []
    for row in matching_rows:
        for img in row['images'] or []:
            image_ids.append(img['image_id'])
    image_storage.delete_images(image_storage.DIAG, ds_id, image_ids)

    # Then remove the DB rows themselves
    row_ids = [row['id'] for row in matching_rows]
    db.alter(
        'DELETE FROM dataset_diagnostic WHERE id = ANY(%s::uuid[])',
        (row_ids,),
    )
def test_classify_ion_images_preds_saved(call_api_mock, image_storage_mock, fill_db):
    """The predictions returned by the (mocked) API are stored in annotation.off_sample."""
    # Mocked classifier response: one prediction per annotation
    call_api_mock.return_value = {
        'predictions': [
            {'prob': 0.1, 'label': 'on'},
            {'prob': 0.9, 'label': 'off'},
        ]
    }

    # Every image fetched from storage is the same blank 10x10 RGBA PNG
    png_buffer = io.BytesIO()
    blank_image = Image.new('RGBA', (10, 10))
    blank_image.save(png_buffer, format='PNG')
    image_storage_mock.get_image.return_value = png_buffer.getvalue()

    db = DB()
    ds_id = '2000-01-01'
    dataset = Dataset.load(db, ds_id)

    classify_dataset_ion_images(db, dataset, defaultdict(str))

    query = (
        'select off_sample '
        'from dataset d '
        'join job j on j.ds_id = d.id '
        'join annotation m on m.job_id = j.id '
        'where d.id = %s '
        'order by m.id '
    )
    stored = db.select_with_fields(query, params=(ds_id,))

    expected = [
        {'off_sample': {'prob': 0.1, 'label': 'on'}},
        {'off_sample': {'prob': 0.9, 'label': 'off'}},
    ]
    assert stored == expected
def test_create_moldb(fill_db, is_public):
    """Creating a database through the API stores its metadata row and its molecules."""
    request_doc = moldb_input_doc(
        file_path=f's3://{BUCKET_NAME}/{MoldbFiles.VALID.value}',
        is_public=is_public,
    )

    with patch_bottle_request(request_doc) as input_doc:
        resp = api.databases.create()

        assert resp['status'] == 'success'
        moldb_id = resp['data']['id']

        db = DB()

        # The stored metadata must match what was sent in the request
        stored_moldb = db.select_one_with_fields(
            'SELECT id, name, version, group_id, is_public FROM molecular_db where id = %s',
            params=(moldb_id,),
        )
        for field in ('name', 'version', 'group_id', 'is_public'):
            assert stored_moldb[field] == input_doc[field]

        # Every imported molecule row must expose the expected columns
        molecules = db.select_with_fields(
            'SELECT * FROM molecule WHERE moldb_id = %s',
            params=(moldb_id,),
        )
        for molecule in molecules:
            print(molecule)
            for field in ('mol_id', 'mol_name', 'formula', 'inchi'):
                assert field in molecule
def update_core_metabolome_database():
    """Rename the 'core_metabolome_v3' molecular database to 'CoreMetabolome' v3.

    If a row named 'core_metabolome_v3' exists, its name, version, full name, description,
    link, citation, group and visibility are overwritten. Afterwards the function logs
    whether a row named 'CoreMetabolome' can be found.
    """
    db = DB()

    legacy_rows = db.select_with_fields(
        "SELECT * FROM molecular_db WHERE name = 'core_metabolome_v3'"
    )
    if legacy_rows:
        moldb = legacy_rows[0]
        logger.info(f'Updating molecular database: {moldb}')

        moldb.update(
            {
                'name': 'CoreMetabolome',
                'version': 'v3',
                'full_name': 'Core Metabolome Database',
                'description': 'METASPACE database of core mammalian metabolites and lipids',
                'link': 'https://metaspace2020.eu',
                'citation': ttdoc(tttext('In preparation')),
                'group_id': None,
                'is_public': True,
            }
        )

        # Column order here must match the placeholder order in the UPDATE statement
        updated_columns = (
            'name',
            'version',
            'full_name',
            'description',
            'link',
            'citation',
            'group_id',
            'is_public',
        )
        update_sql = (
            "UPDATE molecular_db "
            "SET name = %s, version = %s, full_name = %s, description = %s,"
            " link = %s, citation = %s, group_id = %s, is_public = %s "
            "WHERE id = %s;"
        )
        db.alter(
            update_sql,
            params=(*(moldb[column] for column in updated_columns), moldb['id']),
        )

    # Verify the outcome regardless of whether an update was performed
    renamed_rows = db.select_with_fields(
        "SELECT * FROM molecular_db WHERE name = 'CoreMetabolome'"
    )
    if not renamed_rows:
        logger.error('Did not find database "CoreMetabolome"')
    else:
        logger.info(f'Updated database: {renamed_rows[0]}')
def add_diagnostics(diagnostics: List[DatasetDiagnostic]):
    """Upsert dataset diagnostics.

    Existing diagnostics with the same (ds_id, job_id, type) are deleted, together with
    their stored images, before the new ones are inserted. An empty input is a no-op.
    """
    if not diagnostics:
        return

    # Postgres can't enforce the schema of the JSON columns, and many consumers
    # (graphql, python client, etc.) rely on these structures, so validate up front.
    for diag in diagnostics:
        assert 'ds_id' in diag
        assert 'type' in diag
        diag_images = diag.get('images', [])
        assert all(img['key'] in DiagnosticImageKey for img in diag_images)
        assert all(img['format'] in DiagnosticImageFormat for img in diag_images)
        assert all(img['image_id'] in img['url'] for img in diag_images)
        unique_keys = {(img.get('key'), img.get('index')) for img in diag_images}
        assert len(unique_keys) == len(diag_images), 'diagnostic image keys should be unique'

    db = DB()

    # One parallel array per column, consumed by UNNEST in the query below
    new_ds_ids = [diag['ds_id'] for diag in diagnostics]
    new_job_ids = [diag.get('job_id') for diag in diagnostics]
    new_types = [diag['type'] for diag in diagnostics]

    # Find all diagnostics that should be replaced by the new diagnostics
    replaced_rows = db.select_with_fields(
        """
        WITH new_diagnostic AS (
            SELECT UNNEST(%s::text[]) as ds_id, UNNEST(%s::int[]) as job_id,
            UNNEST(%s::text[]) as type
        )
        SELECT dd.ds_id, dd.id, dd.images
        FROM new_diagnostic nd
        JOIN dataset_diagnostic dd ON nd.ds_id = dd.ds_id
            AND (nd.job_id = dd.job_id OR (nd.job_id IS NULL AND dd.job_id is NULL))
            AND nd.type = dd.type
        """,
        [new_ds_ids, new_job_ids, new_types],
    )

    if replaced_rows:
        logger.debug(
            f'Deleting {len(replaced_rows)} existing diagnostics for dataset '
            f'{replaced_rows[0]["ds_id"]}'
        )

        # Group image ids per dataset, because images are deleted one dataset at a time
        stale_images = defaultdict(list)
        for row in replaced_rows:
            for img in row['images'] or []:
                stale_images[row['ds_id']].append(img['image_id'])
        for stale_ds_id, stale_ids in stale_images.items():
            image_storage.delete_images(image_storage.DIAG, stale_ds_id, stale_ids)

        # Delete existing DB rows
        stale_row_ids = [row['id'] for row in replaced_rows]
        db.alter(
            'DELETE FROM dataset_diagnostic WHERE id = ANY(%s::uuid[])',
            (stale_row_ids,),
        )

    logger.debug(
        f'Inserting {len(diagnostics)} diagnostics for dataset {diagnostics[0]["ds_id"]}'
    )
    new_rows = []
    for diag in diagnostics:
        # "data" stays NULL when absent; "images" is always stored as a JSON list
        data_json = numpy_json_dumps(diag['data']) if diag.get('data') is not None else None
        new_rows.append(
            (
                diag['ds_id'],
                diag.get('job_id'),
                diag['type'],
                datetime.now(),
                data_json,
                diag.get('error'),
                numpy_json_dumps(diag.get('images', [])),
            )
        )
    db.insert(
        'INSERT INTO dataset_diagnostic (ds_id, job_id, type, updated_dt, data, error, images) '
        'VALUES (%s, %s, %s, %s, %s, %s, %s)',
        new_rows,
    )
class SciTester:
    """Runs an annotation search on the spheroid test dataset and compares it to a reference.

    The reference results live in a CSV under ``tests/reports``; the new results are read
    from the ``annotation`` table. ``make_comparison_df`` joins both on (formula, adduct)
    and flags, per metric, whether the value differs from the reference.
    """

    def __init__(self, sm_config, analysis_version, database):
        """Set up paths, DB access and the list of compared metrics.

        Args:
            sm_config: engine configuration; ``sm_config['fs']['spark_data_path']`` and
                (for lithops runs) ``sm_config['lithops']`` are read.
            analysis_version: analysis version written into the dataset config.
            database: key into ``MOL_DBS`` selecting the molecular database.
        """
        reports_path = Path(proj_root()) / 'tests/reports'
        # ':' is replaced so the timestamp can be used inside a file name
        timestamp = datetime.now().replace(microsecond=0).isoformat().replace(':', '-')
        suffix = f'{database}-v{analysis_version}'

        self.sm_config = sm_config
        self.db = DB()

        self.ds_id = '2000-01-01_00h00m01s'
        self.ref_results_path = reports_path / f'spheroid-{suffix}.csv'
        self.output_results_path = reports_path / f'test-{suffix}-{timestamp}.csv'

        self.ds_name = 'sci_test_spheroid_untreated'
        self.ds_data_path = join(self.sm_config['fs']['spark_data_path'], self.ds_name)
        self.moldb = MOL_DBS[database]
        self.analysis_version = analysis_version
        self.input_path = join(proj_root(), 'tests/data/untreated')
        self.ds_config_path = join(self.input_path, 'config.json')
        self.metrics = ['chaos', 'spatial', 'spectral', 'mz_err_abs', 'mz_err_rel', 'msm', 'fdr']

        self.comparison_df = None

    def fetch_search_res_df(self):
        """Return the dataset's annotations as a DataFrame, with ``stats`` keys as columns."""
        query = (
            "SELECT m.formula, m.adduct, m.msm, m.fdr, m.stats "
            "FROM annotation m "
            "JOIN job j ON j.id = m.job_id "
            "WHERE j.ds_id = %s "
            "ORDER BY formula, adduct "
        )

        rows = self.db.select_with_fields(query, params=(self.ds_id,))
        return pd.DataFrame(
            [
                {
                    'formula': r['formula'],
                    'adduct': r['adduct'],
                    'msm': r['msm'],
                    'fdr': r['fdr'],
                    **r['stats'],
                }
                for r in rows
            ]
        )

    def save_reference_results(self):
        """Overwrite the reference CSV with the current search results."""
        results_df = self.fetch_search_res_df()

        cols = ['formula', 'adduct', *self.metrics]
        results_df[cols].to_csv(self.ref_results_path, index=False)

        print(f'Successfully saved reference search results to {self.ref_results_path}')

    def save_comparison_results(self):
        """Write the comparison DataFrame to the timestamped output CSV."""
        self.comparison_df.to_csv(self.output_results_path, index=False)

    @staticmethod
    def print_metric_hist(metric_vals):
        """Print a text histogram (one ``lo-hi: count`` line per bin) of ``metric_vals``."""
        lowest = np.min(metric_vals)
        highest = np.max(metric_vals)
        if 0.2 < highest - lowest <= 3.0:
            # For metrics in the range -1.0 to 1.0, aligned bins of 0.1 are easier to read
            min_edge = np.floor(lowest * 10) / 10
            max_edge = np.ceil(highest * 10) / 10
            n_bins = int(np.round((max_edge - min_edge) * 10))
        else:
            # Otherwise use unaligned bins
            min_edge = lowest
            max_edge = highest
            n_bins = 10
        bins = np.linspace(min_edge, max_edge, n_bins + 1)
        metric_freq, metric_interv = np.histogram(metric_vals, bins=bins)

        for lo, hi, freq in zip(metric_interv[:-1], metric_interv[1:], metric_freq):
            print(f'{lo:f}-{hi:f}: {freq}')

    @staticmethod
    def _percent(part, total):
        """Return ``part`` as a percentage of ``total``, or 0.0 when ``total`` is zero."""
        return part * 100 / total if total else 0.0

    def print_differences(self):
        """Print missed/unexpected annotation rates and histograms of differing metrics."""
        df = self.comparison_df
        missing_df = df[df.matching == 'ref_only']
        unexpected_df = df[df.matching == 'new_only']
        common_df = df[df.matching == '']
        # .sum() counts the rows where the mask is True. The previous .count() counted every
        # non-null row, so both denominators were simply the total number of rows.
        n_ref = int(df.matching.isin(['ref_only', '']).sum())
        n_new = int(df.matching.isin(['new_only', '']).sum())

        print(
            f'MISSED FORMULAS: {len(missing_df)} '
            f'({self._percent(len(missing_df), n_ref):.1f}%)'
        )
        print(
            f'FALSE DISCOVERY: {len(unexpected_df)} '
            f'({self._percent(len(unexpected_df), n_new):.1f}%)'
        )

        differing_metrics = [
            metric for metric in self.metrics if common_df[f'{metric}_differs'].any()
        ]
        if differing_metrics:
            for metric in differing_metrics:
                print(f'{metric}_new - {metric}_ref histogram: ')
                self.print_metric_hist(common_df[f'{metric}_new'] - common_df[f'{metric}_ref'])
                print()
        else:
            print('All metrics equal in common annotations')

    def fdr_differs(self, fdr_ref, fdr_new):
        """Return True when two FDR values differ by more than the tolerated amount."""
        if self.analysis_version == 1:
            # FDRs are quantized - allow them to jump up/down one level
            levels = [0.0501, 0.1001, 0.2001, 0.5001]
            ref_level = next((i for i, level in enumerate(levels) if fdr_ref < level), len(levels))
            new_level = next((i for i, level in enumerate(levels) if fdr_new < level), len(levels))
            return abs(ref_level - new_level) > 1
        else:
            # Allow +/- 10% relative difference OR +/- 5% FDR absolute difference to compensate
            # for possible differences if the decoys are sampled differently.
            return not np.isclose(fdr_ref, fdr_new, rtol=0.1, atol=0.05)

    def make_comparison_df(self):
        """Build ``self.comparison_df`` by outer-joining reference and new results."""
        ref_results = pd.read_csv(self.ref_results_path)
        new_results = self.fetch_search_res_df()

        df = ref_results.merge(
            new_results,
            on=['formula', 'adduct'],
            how='outer',
            suffixes=('_ref', '_new'),
            indicator='matching',
        )
        df['matching'] = df.matching.cat.rename_categories(
            {'left_only': 'ref_only', 'right_only': 'new_only', 'both': ''}
        )

        # Interleave columns for easy side-by-side comparison
        cols = ['formula', 'adduct', 'matching']
        for col in self.metrics:
            cols.append(f'{col}_ref')
            cols.append(f'{col}_new')
        # Explicit copy: new columns are assigned below, and assigning into a column
        # subset of another frame can trigger pandas' SettingWithCopyWarning.
        df = df[cols].copy()

        # Add "differs" fields indicating whether the values have changed enough to be
        # considered different from the originals.
        for col in self.metrics:
            if col == 'fdr':
                df['fdr_differs'] = [
                    self.fdr_differs(fdr_ref, fdr_new)
                    for fdr_ref, fdr_new in zip(df['fdr_ref'], df['fdr_new'])
                ]
            else:
                df[f'{col}_differs'] = ~np.isclose(df[f'{col}_ref'], df[f'{col}_new'])

        self.comparison_df = df

    def search_results_are_different(self):
        """Return True if any annotation is unmatched or any metric is flagged as differing."""
        annotations_mismatch = (self.comparison_df.matching != '').any()
        metrics_differ = any(
            self.comparison_df[f'{metric}_differs'].any() for metric in self.metrics
        )
        return annotations_mismatch or metrics_differ

    @classmethod
    def _patch_image_storage(cls):
        """Replace ``search_results.ImageStorage`` with a stub whose ``post_image`` does nothing."""

        class ImageStorageMock:
            ISO = image_storage.ISO

            def __init__(self, *args, **kwargs):
                pass

            def post_image(self, *args, **kwargs):
                pass

        from sm.engine.annotation_spark import search_results

        search_results.ImageStorage = ImageStorageMock

    def run_search(self, store_images=False, use_lithops=False):
        """Run the annotation job for the test dataset, then build the comparison DataFrame.

        Args:
            store_images: when False, image storage is replaced with a no-op stub.
            use_lithops: run ``ServerAnnotationJob`` through a lithops ``Executor`` instead
                of ``AnnotationJob``.
        """
        if not store_images:
            self._patch_image_storage()

        moldb_id = molecular_db.find_by_name_version(self.moldb['name'], self.moldb['version']).id

        os.environ['PYSPARK_PYTHON'] = sys.executable

        ds = create_ds_from_files(self.ds_id, self.ds_name, self.input_path)
        ds.config['analysis_version'] = self.analysis_version
        ds.config['fdr']['scoring_model'] = 'v3_default' if self.analysis_version > 1 else None
        ds.config['database_ids'] = [moldb_id]

        # Drop jobs left over from earlier runs of the same dataset before saving it
        self.db.alter('DELETE FROM job WHERE ds_id=%s', params=(ds.id,))
        ds.save(self.db, allow_insert=True)
        perf = NullProfiler()
        if use_lithops:
            # Override the runtime to force it to run without docker.
            lithops_executor.RUNTIME_CF_VPC = 'python'
            lithops_executor.RUNTIME_CE = 'python'

            executor = Executor(self.sm_config['lithops'], perf)
            job = ServerAnnotationJob(
                executor,
                ds,
                perf,
                self.sm_config,
                store_images=store_images,
            )
            job.run(debug_validate=True)
        else:
            AnnotationJob(ds, perf).run()

        self.make_comparison_df()

    def clear_data_dirs(self):
        """Remove the dataset's data directory if it exists.

        ``Path.rmdir`` only removes an empty directory and raises ``OSError`` otherwise.
        """
        path = Path(self.ds_data_path)
        if path.exists():
            path.rmdir()
def run_coloc_jobs(
    sm_config, ds_id_str, sql_where, fix_missing, fix_corrupt, skip_existing, use_lithops
):
    """Run colocalization jobs for a set of datasets chosen by exactly one data source.

    Args:
        sm_config: engine configuration; ``sm_config['lithops']`` is read when
            ``use_lithops`` is set.
        ds_id_str: comma-separated dataset ids.
        sql_where: SQL condition appended to ``SELECT ... FROM dataset WHERE``.
        fix_missing: select datasets returned by ``MISSING_COLOC_JOBS_SEL``.
        fix_corrupt: select datasets returned by ``CORRUPT_COLOC_JOBS_SEL``.
        skip_existing: when True, jobs are run with ``reprocess=False``.
        use_lithops: run each job through a lithops ``Executor``.

    A failure on one dataset is logged and does not stop the remaining datasets.
    """
    n_sources = sum(1 for source in (ds_id_str, sql_where, fix_missing, fix_corrupt) if source)
    assert (
        n_sources == 1
    ), "Exactly one data source (ds_id, sql_where, fix_missing, fix_corrupt) must be specified"
    assert not (ds_id_str and sql_where)

    db = DB()

    if ds_id_str:
        ds_ids = ds_id_str.split(',')
    elif sql_where:
        # NOTE(review): sql_where is interpolated into the query verbatim, so it must
        # only ever come from a trusted operator.
        matching = db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ds_ids = [matched_id for (matched_id,) in matching]
    else:
        moldb_docs = db.select_with_fields('SELECT id, name FROM molecular_db m')
        id_name_pairs = [(doc['id'], doc['name']) for doc in moldb_docs]
        # Transpose the (id, name) pairs into two parallel lists
        mol_db_ids, mol_db_names = (list(column) for column in zip(*id_name_pairs))
        fdrs = [0.05, 0.1, 0.2, 0.5]
        algorithms = ['median_thresholded_cosine', 'cosine']
        query_params = [mol_db_ids, mol_db_names, fdrs, algorithms]

        if fix_missing:
            logger.info('Checking for missing colocalization jobs...')
            found = db.select(MISSING_COLOC_JOBS_SEL, query_params)
            ds_ids = [found_id for (found_id,) in found]
            logger.info(f'Found {len(ds_ids)} missing colocalization sets')
        else:
            logger.info(
                'Checking all colocalization jobs. '
                'This is super slow: ~5 minutes per 1000 datasets...'
            )
            found = db.select(CORRUPT_COLOC_JOBS_SEL, query_params)
            ds_ids = [found_id for (found_id,) in found]
            logger.info(f'Found {len(ds_ids)} corrupt colocalization sets')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    # The executor is only created (and only used) for lithops runs
    executor = Executor(sm_config['lithops']) if use_lithops else None
    reprocess = not skip_existing
    total = len(ds_ids)

    for position, ds_id in enumerate(ds_ids, start=1):
        try:
            logger.info(f'Running colocalization on {position} out of {total}')
            ds = Dataset.load(db, ds_id)
            coloc = Colocalization(db)
            if use_lithops:
                coloc.run_coloc_job_lithops(executor, ds, reprocess=reprocess)
            else:
                coloc.run_coloc_job(ds, reprocess=reprocess)
        except Exception:
            # Deliberately best-effort: log the failure and continue with the next dataset
            logger.error(f'Failed to run colocalization on {ds_id}', exc_info=True)