# ~ setup ~
# For local development, load credentials from a .env file and start a
# Molgenis session against the ACC (acceptance) server. Swap the commented
# host line to target PROD instead.
from dotenv import load_dotenv
from os import environ
load_dotenv()

# host = environ['MOLGENIS_PROD_HOST']
host = environ['MOLGENIS_ACC_HOST']

rd3 = Molgenis(url=host)
rd3.login(
    username=environ['MOLGENIS_ACC_USR'],
    password=environ['MOLGENIS_ACC_PWD'])

# pull RD3 data: phenopacket file metadata from the portal staging table,
# and freeze1 subject metadata for later comparison
files = rd3.get(
    entity='rd3_portal_cluster',
    q='type=="phenopacket"',
    attributes='release,name,type',
    batch_size=10000)

subjects = rd3.get(
    entity='rd3_freeze1_subject',
    attributes='id,subjectID,patch',
    batch_size=10000)

statusMsg('File metadata entries pulled: {}'.format(len(files)))
statusMsg('Subject metadata entries pulled: {}'.format(len(subjects)))

# extract the subject ID from each file name by stripping an optional
# date stamp plus the '.json' extension (e.g. 'P0001.2021-06-01.json'
# and 'P0001.json' both reduce to 'P0001').
# BUGFIX: dots are escaped (\.) — the original pattern used bare '.',
# which matches ANY character, so e.g. 'P0001_json' would also be stripped.
for file in files:
    file['subject'] = re.sub(
        pattern=r'((\.[0-9]{4}-[0-9]{2}-[0-9]{2})?(\.json))$',
        repl='',
        string=file['name'])
# /////////////////////////////////////////////////////////////////////////////

# ~ 1 ~
# Start Molgenis Session and Pull Required Data
#
# New phenopacket files are compared against the metadata already in RD3 so
# that only values that actually changed are imported, rather than everything.
# Processed values are imported into the `subject` and `subjectinfo` tables;
# the GET requests below list exactly the attributes managed by this script.

# subject metadata for the current freeze
freeze = rd3.get(
    entity=paths['rd3_subjects'],
    attributes='id,subjectID,clinical_status,disease,phenotype,hasNotPhenotype,phenopacketsID,patch',
    batch_size=10000)

# subjectinfo records for the same freeze
freeze_info = rd3.get(
    entity=paths['rd3_subjectinfo'],
    attributes='id,dateofBirth,ageOfOnset,patch',
    batch_size=10000)

# keep the subject row identifiers for quick membership checks later on
freeze_ids = [subject['id'] for subject in freeze]

# reference tables: HPO terms and disease codes (flattened further downstream)
hpo_codes_raw = rd3.get(entity='rd3_phenotype', batch_size=10000)
disease_codes_raw = rd3.get(entity='rd3_disease', batch_size=10000)
# migrate data from one server to the other: # pull data then switch tokens and restart connection # portalData = rd3.get(releaseName,batch_size=10000) # rd3.importData(entity='rd3_portal_release_freeze3', data=portalData) #////////////////////////////////////////////////////////////////////////////// # ~ 0 ~ # Create Reference Datasets # Pull reference tables to create mapping tables for recoding raw values into # RD3 terminology. Add additional mappings as needed. # ~ 0a ~ # Create ERN Mapping erns = dt.Frame(rd3.get('rd3_ERN')) del erns['_href'] # as key pair dictionary ernMappings = toKeyPairs(data=erns[:, { 'from': f.identifier, 'to': f.identifier }].to_pandas().to_dict('records'), keyAttr='from', valueAttr='to') # define additional ERN mappings based on past/present values the variation # must be mapped to an existing ERN identifier. The format you should use is: # `'variation' : 'RD3 ERN identifier'` ernMappings.update({ 'ERN-CRANIO': 'ERNCRANIO',
# Pull Data
# Novelomics releases arrive in rd3_portal_novelomics. Data is sent from EGA
# and Tubingen, and is sometimes supplied by CNAG. Running this script
# requires both novelomics portal tables, the reference entities, and a list
# of existing subject and sample IDs.
#
# Pull mapping tables or define them below.

# ~ 0a ~
# Pull portal tables
# After the initial run, make sure the query param is uncommented.
statusMsg('Pulling data from the portal....')

shipment = dt.Frame(
    rd3.get(
        entity='rd3_portal_novelomics_shipment',
        q='processed==False',
        batch_size=10000))

experiment = dt.Frame(
    rd3.get(
        entity='rd3_portal_novelomics_experiment',
        q='processed==False',
        batch_size=10000))

# drop Molgenis response metadata
del shipment['_href']
del experiment['_href']

# ~ 0b ~
# Build Patch Information
# Determine if there are any new releases based on type of analysis. If there
# are, stop this script and complete the following
# 1. Determine if this is an actual new study or if this data should be
    # (continuation) tail of a source-to-RD3 column rename mapping;
    # the opening of this dict literal sits above this chunk
    'sample_id': 'sampleID',
    'participant_subject': 'subjectID',
    'pathological state': 'pathologicalState',
    'tumor cell fraction': 'percentageTumorCells'
}

# force the join key to string before keying the frame
newData[:, dt.update(sampleID=as_type(f.sampleID, str))]
newData.key = 'sampleID'

# ~ 1b ~
# Pull the deepwes data from RD3
# Unnest reference attributes and set key
samples = rd3.get(
    entity='rd3_noveldeepwes_sample',
    attributes='id,sampleID,subject',
    batch_size=10000)

# flatten the nested subject reference down to its subjectID
# NOTE(review): assumes every sample row has a subject object; a row with a
# missing subject would raise here — confirm against the source table
for row in samples:
    row['subject'] = row['subject']['subjectID']

samples = dt.Frame(samples)
del samples['_href']  # Molgenis response metadata
samples.key = 'sampleID'

# ~ 1c ~
# Join datasets
# join incoming data onto the existing RD3 samples on the 'sampleID' key
newSamplesData = samples[:, :, dt.join(newData)]

# recode attribute
# RD3 `rd3_freeze[x]_subject` where `[x]` is the freeze that the new PED files
# are tied to (e.g., `rd3_freeze2_subject`). The attributes pulled are:
#
# - `id`: the molgenis row ID; a concatenation of subject ID and release
# - `subjectID`: RD3 P number
# - `sex`: patient's sex
# - `fid`: family ID
#
# Extensive comparison checks between PED file data and the values already in
# RD3 are unnecessary: PED files should be considered the most up to date.

# subject metadata for the current freeze
freeze_subject_metadata = rd3.get(
    entity=paths['rd3_subjects'],
    # q = 'patch=freeze1_patch1',
    attributes='id,subjectID,sex1,fid',
    batch_size=10000)

# flatten subjectIDs so later membership tests are cheap
subject_ids = [subject['subjectID'] for subject in freeze_subject_metadata]

# In addition to subject metadata, it is import to pull file metadata to identify
# which have changed and should be processed. We will pull the following
# attributes:
#
# - `EGA`: the EGA file ID
# - `name`: the full name of the file
# - `md5`: checksum
#
# combine regular and novelomics release identifiers into one list to iterate
availableReleases = regularReleases + novelomicsReleases

statusMsg('Pulling metadata....')

# fetch subject metadata
# loop over each release table (rd3_<release>_subject) and pull the managed
# attributes; cleaned rows are collected into `subjects` (continuation of the
# loop is beyond this chunk)
subjects=[]
for release in availableReleases:
    statusMsg('Fetching subject metadata for',release)
    data=rd3.get(
        entity=f"rd3_{release}_subject",
        batch_size=10000,
        attributes=','.join([
            'id',
            'subjectID',
            'sex1',
            'fid',
            'mid',
            'pid',
            'clinical_status',
            'disease',
            'phenotype',
            'hasNotPhenotype',
            'organisation',
            'ERN',
            'solved',
            'patch'
        ])
    )
    # clean data: unnest reference objects to flat values
    # NOTE(review): the {} default guards a missing key; presumably the API
    # omits empty references entirely — a key present with a None value would
    # raise on the chained .get, confirm API behavior
    for row in data:
        row['sex1']=row.get('sex1',{}).get('identifier')
        row['mid']=row.get('mid',{}).get('id')
        row['pid']=row.get('pid',{}).get('id')
        # collapse the list of disease references into a comma-separated string
        if row.get('disease'):
            row['disease']=','.join([record['id'] for record in row['disease']])
        else: