def reporting_loop(ori_ds_id, db_id, msms_ds_id, out_path, parent_and_fragment_req=True, fdr_max=0.5, save_image=True):
    """Fetch annotation results for one dataset, pickle them and export its images.

    Steps, in order:
      1. Connect to the beta METASPACE server and log on via ``logon_metaspace``.
      2. Load the results of dataset ``msms_ds_id`` for database ``db_id`` and
         pass them through ``extract_results_metaspace``.
      3. Optionally keep only rows with ``parent_y == 1`` and ``n_frag_y > 0``.
      4. Pickle the table to
         ``<out_path><msms_ds_id>/ms2_<msms_ds_id>_db_<db_id>_ms1_<ori_ds_id>.pickle``.
      5. Download images with ``dl_img`` into ``<out_path><msms_ds_id>/by_formula/``
         and regroup them by parent with ``copy_by_parent``.

    Args:
        ori_ds_id: Dataset id that is only used to build the pickle file name.
        db_id: Database identifier; concatenated into the file name, so it
            must be a string.
        msms_ds_id: Id of the dataset whose results and images are fetched.
        out_path: Output root. It is joined by plain string concatenation, so
            it has to end with a path separator.
        parent_and_fragment_req: When equal to ``True``, apply the
            parent/fragment row filter described above.
        fdr_max: Forwarded unchanged to ``dl_img``.
        save_image: Forwarded unchanged to ``dl_img`` and ``copy_by_parent``.

    Returns:
        None. All results are written to disk.
    """
    # Connect to the server and authenticate.
    sm = logon_metaspace(SMInstance(host='https://beta.metaspace2020.eu'))

    # Pull the annotation table for the target database and post-process it.
    ds = sm.dataset(id=msms_ds_id)
    raw_results = ds.results(database=db_id).reset_index()
    results_df = extract_results_metaspace(msms_ds_id, raw_results)

    # Equality (not truthiness) is kept deliberately: only True/1 enable the filter.
    if parent_and_fragment_req == True:  # noqa: E712
        keep_rows = (results_df.parent_y == 1) & (results_df.n_frag_y > 0)
        results_df = results_df[keep_rows]

    # Everything for this dataset lives below <out_path><msms_ds_id>/.
    ds_dir = out_path + msms_ds_id + '/'
    pathlib.Path(ds_dir).mkdir(parents=True, exist_ok=True)
    out_df = ds_dir + "ms2_" + msms_ds_id + "_db_" + db_id + "_ms1_" + ori_ds_id + '.pickle'
    results_df.to_pickle(out_df)

    # Download the images for this dataset, keyed by the pickle path.
    img_dict = {
        out_df: dl_img(ds, msms_ds_id, db_id, fdr_max, ds_dir + 'by_formula/', save_image),
    }

    # Regroup the per-formula images/arrays by parent id.
    copy_by_parent(img_dict, msms_ds_id, out_df, out_path, save_image)
    return None
def get_reference_results(metaspace_options, ds_id):
    """Download the 'HMDB-v4' annotation results of a dataset from METASPACE.

    Args:
        metaspace_options: Mapping of connection settings. A truthy ``'host'``
            selects the server (otherwise the client default is used). A truthy
            ``'password'`` triggers a login, in which case ``'email'`` must be
            present as well.
        ds_id: Id of the dataset to fetch.

    Returns:
        The results table restricted to the columns ``formula``, ``adduct``,
        ``chaos``, ``spatial``, ``spectral``, ``msm`` and ``fdr``, where
        ``moc``/``rhoSpatial``/``rhoSpectral`` have been renamed to
        ``chaos``/``spatial``/``spectral``.
    """
    # Imported lazily so the dependency is only needed when this is called.
    from metaspace.sm_annotation_utils import SMInstance

    host = metaspace_options.get('host')
    sm = SMInstance(host=host) if host else SMInstance()

    password = metaspace_options.get('password')
    if password:
        sm.login(metaspace_options['email'], password)

    column_names = {'moc': 'chaos', 'rhoSpatial': 'spatial', 'rhoSpectral': 'spectral'}
    wanted_columns = ['formula', 'adduct', 'chaos', 'spatial', 'spectral', 'msm', 'fdr']

    annotations = sm.dataset(id=ds_id).results('HMDB-v4').reset_index()
    return annotations.rename(column_names, axis=1)[wanted_columns]
def get_ds_data(ds_id, fdr):
    """Collect colocalization data and m/z information for one dataset.

    Annotations are requested for database 'HMDB-v4' at FDR level ``fdr``,
    with neutral losses, chemical modifications and hidden adducts all
    switched off.

    Args:
        ds_id: Dataset id, passed as the ``ids`` dataset filter.
        fdr: FDR level, passed as the ``fdrLevel`` annotation filter.

    Returns:
        ``None`` when two or fewer annotations come back; otherwise a tuple
        ``(coloc, mz_range, mz_dict)`` where ``coloc`` is the result of
        ``get_coloc_matrix``, ``mz_range`` is ``(min_mz, max_mz)`` and
        ``mz_dict`` maps ``sumFormula + adduct`` to the annotation's m/z.
    """
    annotation_filter = {
        'database': 'HMDB-v4',
        'fdrLevel': fdr,
        'hasNeutralLoss': False,
        'hasChemMod': False,
        'hasHiddenAdduct': False,
    }
    dataset_filter = {'ids': ds_id}

    # A fresh client is created on every call, exactly as before.
    anns = SMInstance()._gqclient.getAnnotations(annotation_filter, dataset_filter)

    # Too few annotations: nothing is computed.
    if len(anns) <= 2:
        return None

    coloc = get_coloc_matrix(anns)
    mzs = np.array([ann['mz'] for ann in anns])
    mz_range = (np.min(mzs), np.max(mzs))
    mz_dict = {ann['sumFormula'] + ann['adduct']: ann['mz'] for ann in anns}
    return coloc, mz_range, mz_dict
'hasChemMod': False, 'hasHiddenAdduct': False }, {'ids': ds_id}) if len(anns) > 2: coloc = get_coloc_matrix(anns) mzs = np.array([ann['mz'] for ann in anns]) mz_range = np.min(mzs), np.max(mzs) mz_dict = dict([(ann['sumFormula'] + ann['adduct'], ann['mz']) for ann in anns]) return coloc, mz_range, mz_dict return None # test_coloc, test_mz_range, test_mz_dict = get_ds_data('2019-08-24_17h34m28s') #%% Process, Concat, Save colocalizations sm = SMInstance() # sm.login(**json.loads(open('/home/lachlan/.metaspace.json'))) @filecache(mru=0) def fetch_data_from_metaspace(pol, fdr): all_coloc = [] all_ranges = [] all_mz_dict = {} ion_present_in_ds = set() datasets = [ SMDataset(info, sm._gqclient) for info in sm._gqclient.getDatasets( {'polarity': 'POSITIVE' if pol == POS else 'NEGATIVE'}) ] ds_ids = [] with ProcessPoolExecutor(8) as ex:
It looks like the python client can't handle cases where a candidate molecule has a None url. As an interim workaround, I'd suggest making a local copy of the results() method that you're using from here: https://github.com/metaspace2020/metaspace/blob/master/metaspace/python-client/metaspace/sm_annotation_utils.py#L666-L711 Four changes are needed: * Remove the self argument on line 666 so that you can call it from outside of a class * Change records = self._gqclient.getAnnotations( to sm._gqclient.getAnnotations( on line 681 to remove the dependency on the SMDataset class * Change self.id to the dataset id on line 683 * Remove the moleculeIds= assignment on line 695 - that's where the bug is. ''' import pandas as pd from metaspace.sm_annotation_utils import SMInstance #sm = SMInstance() sm = SMInstance(host='https://beta.metaspace2020.eu') def results(dsid_used, database, fdr=None, coloc_with=None): if coloc_with: assert fdr coloc_coeff_filter = { 'database': database, 'colocalizedWith': coloc_with, 'fdrLevel': fdr, } annotation_filter = coloc_coeff_filter.copy() else: coloc_coeff_filter = None annotation_filter = {'database': database, 'hasHiddenAdduct': True }
to fix the code here to keep track of them. """ import logging from copy import deepcopy from pathlib import Path from typing import Any, Tuple from metaspace.sm_annotation_utils import SMInstance from sm.engine.ds_config import DSConfig from sm.fdr_engineering.rerun_datasets import reprocess_dataset_remote, wait_for_datasets # GlobalInit() Only needed for the "_local" functions logger = logging.getLogger(__name__) sm_src = SMInstance() sm_dst = SMInstance(config_path=str(Path.home() / '.metaspace.local')) DST_SUFFIX = '_ml_training' core_metabolome_dst_id = next( db.id for db in sm_dst.databases() if db.name == 'CoreMetabolome' and db.version == 'v3') data_dir = Path( 'local/ml_scoring').resolve() # the "local" subdirectory is .gitignored data_dir.parent.mkdir(parents=True, exist_ok=True) dataset_ids_file = data_dir / 'dataset_ids.txt' dataset_ids = [ds_id.strip() for ds_id in dataset_ids_file.open().readlines()] dst_dataset_ids = [ds_id + DST_SUFFIX for ds_id in dataset_ids] #%% # # Test VPC
'mz_err_rel_abserr', # _fdr suffix applies the FDR transformation 'chaos_fdr', 'spatial_fdr', 'spectral_fdr', 'mz_err_abs_fdr', 'mz_err_rel_fdr', ] #%% Download the data or load it from a local cache file downloaded_data_file = data_dir / 'metrics_df_fdr20.parquet' FORCE_REDOWNLOAD = False if downloaded_data_file.exists() and not FORCE_REDOWNLOAD: metrics_df = pd.read_parquet(downloaded_data_file) logger.info(f'Loaded {downloaded_data_file}') else: sm_dst = SMInstance(config_path=str(Path.home() / '.metaspace.local')) # ds_diags is an iterable to save temp memory ds_diags = get_many_fdr_diagnostics_remote(sm_dst, dst_dataset_ids) metrics_df = get_ranking_data(ds_diags, all_features) metrics_df.to_parquet(downloaded_data_file) #%% Recalculate FDR fields def calc_fdr_fields(df): target = df.target == 1.0 target_df = df[target].copy() decoy_df = df[~target].copy() # FIXME: Remove hard-coded value 20 - should be decoy_sample_size decoy_sample_size = 20 / df[df.target == 1].modifier.nunique() add_derived_features(target_df, decoy_df, decoy_sample_size, all_features)
'hasChemMod': False, 'hasHiddenAdduct': False }, {'ids': ds_id}) if len(anns) > 2: coloc = get_coloc_matrix(anns) mzs = np.array([ann['mz'] for ann in anns]) mz_range = np.min(mzs), np.max(mzs) mz_dict = dict([(ann['sumFormula'] + ann['adduct'], ann['mz']) for ann in anns]) return coloc, mz_range, mz_dict return None # test_coloc, test_mz_range, test_mz_dict = get_ds_data('2019-08-24_17h34m28s') #%% Process, Concat, Save colocalizations sm = SMInstance() # sm.login(**json.loads(open('/home/lachlan/.metaspace.json'))) datasets = sm.datasets() def fetch_data_from_metaspace(is_pos, coloc_filename, data_filename): all_coloc = [] all_ranges = [] all_mz_dict = {} ion_present_in_ds = set() datasets = [ SMDataset(info, sm._gqclient) for info in sm._gqclient.getDatasets( {'polarity': 'POSITIVE' if is_pos else 'NEGATIVE'}) ] ds_ids = []
MATRIX_RE = re.compile('|'.join( re.sub('[()]', '\\$0', k) for k in MATRIX_MAPPING.keys())) def normalize_matrix(matrix): if not matrix: return 'Other' m = MATRIX_RE.search(matrix.lower()) if m: return MATRIX_MAPPING[m[0]] return 'Other' #%% sm = SMInstance( ) # Call sm.save_login() to save a credentials file to access private DSs all_datasets = sm.datasets(status='FINISHED') all_ds_df = pd.DataFrame({ 'ds_id': ds.id, 'name': ds.name, 'group': ds.group['shortName'] if ds.group else 'None', 'submitter': ds.submitter['name'], # pylint: disable=protected-access 'is_public': ds._info['isPublic'], 'polarity':
def sm():
    """Return an ``SMInstance`` configured from the ``test_config`` file.

    The config path is ``../../test_config`` relative to the directory that
    contains this file, resolved to an absolute path.
    """
    config_file = Path(__file__).parent / '../../test_config'
    return SMInstance(config_path=config_file.resolve())
def reprocess_dataset_remote(
    sm_src: SMInstance,
    sm_dst: SMInstance,
    src_ds_id: str,
    dst_ds_id: str,
    update_metadata_func,
    skip_existing=True,
):
    """Submit a copy of a source dataset for processing on a destination server.

    The destination dataset is considered to exist already when it can be
    fetched, its status is 'FINISHED' and it has at least one diagnostic of
    type ``DiagnosticType.FDR_RESULTS``.

    Args:
        sm_src: Client for the server that holds the source dataset.
        sm_dst: Client for the server the dataset is (re)created on.
        src_ds_id: Id of the dataset to copy from.
        dst_ds_id: Id to give the created dataset (setting an explicit id
            requires an admin account on the destination).
        update_metadata_func: Callable invoked as
            ``update_metadata_func(metadata, config)`` with the source
            dataset's metadata and config; must return the
            ``(ds_metadata, ds_config)`` pair to submit.
        skip_existing: When true, do nothing if the destination dataset
            already exists as described above.

    Returns:
        ``(dst_ds_id, None)`` when the dataset was skipped, otherwise the
        ``'datasetId'`` value from the create-dataset GraphQL response.
        NOTE(review): the two branches return different shapes; this is kept
        as-is because callers may rely on it.
    """
    # Best-effort probe: any failure while inspecting the destination dataset
    # (for example it does not exist yet) simply means "not processed yet".
    # Explicit boolean checks are used instead of ``assert`` so the result does
    # not change when Python runs with -O, which strips assert statements.
    try:
        dst_ds = sm_dst.dataset(id=dst_ds_id)
        existing = dst_ds.status == 'FINISHED' and any(
            diag['type'] == DiagnosticType.FDR_RESULTS for diag in dst_ds.diagnostics(False)
        )
    except Exception:
        existing = False

    if skip_existing and existing:
        print(f'Skipping {dst_ds_id}\n', end=None)
        return dst_ds_id, None

    smds = sm_src.dataset(id=src_ds_id)
    ds_metadata, ds_config = update_metadata_func(smds.metadata, smds.config)

    # pylint: disable=protected-access
    # There's no other clean way to get _gqclient
    gqclient_dst = sm_dst._gqclient
    graphql_response = gqclient_dst.create_dataset(
        {
            'name': smds.name,
            'inputPath': smds.s3dir,
            'metadataJson': json.dumps(ds_metadata),
            'databaseIds': ds_config['database_ids'],
            'adducts': ds_config['isotope_generation']['adducts'],
            'neutralLosses': ds_config['isotope_generation']['neutral_losses'],
            'chemMods': ds_config['isotope_generation']['chem_mods'],
            'ppm': ds_config['image_generation']['ppm'],
            'numPeaks': ds_config['isotope_generation']['n_peaks'],
            'decoySampleSize': ds_config['fdr']['decoy_sample_size'],
            'analysisVersion': ds_config['analysis_version'],
            'submitterId': sm_dst.current_user_id(),
            'groupId': gqclient_dst.get_primary_group_id(),
            # 'projectIds': project_ids,
            'isPublic': False,
            'scoringModel': ds_config['fdr'].get('scoring_model'),
            'computeUnusedMetrics': ds_config['image_generation']['compute_unused_metrics'],
        },
        ds_id=dst_ds_id,  # Requires admin account
    )
    return json.loads(graphql_response)['datasetId']
def post(self, study_id):
    """Import METASPACE datasets/projects into the given study.

    The request body is expected to be JSON with a ``project`` object that may
    contain ``metaspace-api-key``, ``metaspace-password``, ``metaspace-email``,
    ``metaspace-datasets`` and ``metaspace-projects``. The optional
    ``user_token`` request header is used for the permission check.

    Responses:
        404: ``study_id`` is None.
        403: the caller has no write access to the study.
        406: the body has no ``project`` key (more precisely: a ``KeyError``
             was raised while processing the body), or neither an API key nor
             an email/password pair was supplied.
        417: any other error while processing the body or importing.

    Returns:
        ``{"Success": ...}`` when ``import_metaspace`` returned a truthy value,
        otherwise ``{"Warning": ...}`` (this includes an empty request body or
        an empty ``project`` object).
    """
    log_request(request)

    # param validation
    if study_id is None:
        abort(404, 'Please provide valid parameter for study identifier')
    study_id = study_id.upper()

    # User authentication
    user_token = None
    if "user_token" in request.headers:
        user_token = request.headers["user_token"]

    # check for access rights
    is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \
        study_status = wsc.get_permissions(study_id, user_token)
    if not write_access:
        abort(403)

    investigation = None
    metaspace_projects = None
    metaspace_api_key = None
    metaspace_password = None
    metaspace_email = None
    metaspace_datasets = None

    # Set inside the try block, acted on after it. NOTE(review): abort() is
    # presumably Flask/Flask-RESTful's, which signals by raising an
    # Exception subclass - calling it inside the try would be caught by the
    # generic ``except Exception`` below and turned into a 417.
    credentials_missing = False

    # body content validation
    if request.data:
        try:
            data_dict = json.loads(request.data.decode('utf-8'))
            project = data_dict['project']
            if project:
                if "metaspace-api-key" in project:
                    metaspace_api_key = project['metaspace-api-key']
                if "metaspace-password" in project:
                    metaspace_password = project['metaspace-password']
                if "metaspace-email" in project:
                    metaspace_email = project['metaspace-email']
                if "metaspace-datasets" in project:
                    metaspace_datasets = project['metaspace-datasets']
                    logger.info('Requesting METASPACE datasets ' + metaspace_datasets)
                if "metaspace-projects" in project:
                    metaspace_projects = project['metaspace-projects']
                    logger.info('Requesting METASPACE projects ' + metaspace_projects)

                # study_location = os.path.join(study_location, 'METASPACE')

                sm = SMInstance()
                if metaspace_api_key:
                    # Log in with API key.
                    # Users can generate an API key in the "API access" section of
                    # https://metaspace2020.eu/user/me
                    # If you're connecting to our GraphQL API directly, API key
                    # authentication requires an HTTP header "Authorization: Api-Key "
                    # followed by the key.
                    sm.login(email=None, password=None, api_key=metaspace_api_key)
                    # logged_id = sm.logged_in
                elif metaspace_password and metaspace_email:
                    sm.login(email=metaspace_email, password=metaspace_password, api_key=None)
                else:
                    credentials_missing = True

                if not credentials_missing:
                    if not os.path.isdir(study_location):
                        os.makedirs(study_location, exist_ok=True)

                    # Annotate the METASPACE project and return all relevant dataset and project ids
                    metaspace_project_ids, metaspace_dataset_ids = \
                        annotate_metaspace(study_id=study_id,
                                           sm=sm,
                                           metaspace_projects=metaspace_projects,
                                           metaspace_datasets=metaspace_datasets)

                    investigation = import_metaspace(study_id=study_id,
                                                     dataset_ids=metaspace_dataset_ids,
                                                     study_location=study_location,
                                                     user_token=user_token,
                                                     obfuscation_code=obfuscation_code,
                                                     sm_instance=sm)
        except KeyError:
            abort(406, "No 'project' parameter was provided.")
        except AttributeError as e:
            abort(417, "Missing attribute/element in JSON string" + str(e))
        except Exception as e:
            abort(417, str(e))

        # Raised outside the try block so the 406 reaches the client unchanged.
        if credentials_missing:
            abort(406, "No METASPACE API key or username/password provided.")

    if investigation:
        return {"Success": "METASPACE data imported successfully"}
    else:
        return {"Warning": "Please check if METASPACE data was successfully imported"}