# Query the Kids First NCPI disease table and summarise the disease codes used.
import pandas as pd

from fasp.search import DiscoverySearchClient


def main(argv):
    searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com')
    query = 'select id, disease from kidsfirst.ga4gh_tables.ncpi_disease'
    res = searchClient.runQuery(query)

    # build a dictionary of disease code -> display text
    diseases = {}
    for r in res:
        disease = r[1]
        dName = disease['identifier'][0]['value']
        code = disease['code']['coding'][0]['code']
        text = disease['code']['text']
        diseases[code] = text

    disease_df = pd.DataFrame.from_dict(diseases, orient='index', columns=['Term'])
    for k, v in diseases.items():
        print(k, v)
    print("found {} disease records".format(len(res)))
    print("There were {} disease codes used".format(len(diseases)))
    disease_df.to_csv('~/ncpi_kf_disease_terms.tsv', sep='\t')
def main(argv): searchClient = DiscoverySearchClient( 'https://ga4gh-search-adapter-presto-public.prod.dnastack.com') query = """select id, patient from kidsfirst.ga4gh_tables.patient where json_extract_scalar(patient, '$.extension[0].url') = 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity' limit 3""" #TODO query on the value of ethnicity showDetail = True res = searchClient.runQuery(query) if showDetail: print(json.dumps(res, indent=2)) for r in res: patient = r[1] print(patient['id'], patient['gender']) for e in patient['extension']: print(e['url']) print(e['extension'][0]['url']) vc = e['extension'][0]['valueCoding'] print(vc['code'], vc['display'])
# Cross-check subject lists: GA4GH Search (phenopackets) vs BigQuery (GECCO CRC Susceptibility).
from fasp.search import DiscoverySearchClient, BigQuerySearchClient


def main(argv):
    searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/')
    #query = "select id, phenopacket from sample_phenopackets.ga4gh_tables.gecco_phenopackets limit 10"
    query = "select id from sample_phenopackets.ga4gh_tables.gecco_phenopackets where json_extract_scalar(phenopacket, '$.subject.sex') = 'MALE'"

    bqSearchClient = BigQuerySearchClient()
    crdcquery = """
        SELECT BioSample_Accession id
        FROM `isbcgc-216220.GECCO_CRC_Susceptibility.Subject_Phenotypes` sp
        join `isbcgc-216220.GECCO_CRC_Susceptibility.Sample_MULTI` sm
        on sm.dbgap_subject_id = sp.dbgap_subject_id
        and sex = 'Male'"""

    dbList = []
    results = bqSearchClient.runQuery(crdcquery)
    print(len(results))
    for r in results:
        dbList.append(r['id'])

    ppList = []
    query_job = searchClient.runQuery(query)  # Send the query
    print(len(query_job))
    for r in query_job:
        ppList.append(r[0])

    # compare the sorted lists of ids from the two sources
    dbList.sort()
    ppList.sort()
    if dbList == ppList:
        print("The lists dbList and ppList are the same")
    else:
        print("The lists dbList and ppList are not the same")
# Retrieve a few Kids First patient records and print them as JSON.
import json

from fasp.search import DiscoverySearchClient


def main(argv):
    searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com')
    query = 'select id, patient from kidsfirst.ga4gh_tables.patient limit 3'
    res = searchClient.runQuery(query)
    print(json.dumps(res, indent=2))
def main(argv): searchClient = DiscoverySearchClient( 'https://ga4gh-search-adapter-presto-public.prod.dnastack.com', debug=False) query = """SELECT s.su_submitter_id, drs_id FROM thousand_genomes.onek_genomes.ssd_drs s join thousand_genomes.onek_genomes.sra_drs_files f on f.sample_name = s.su_submitter_id where filetype = 'bam' and mapped = 'mapped' and sequencing_type ='exome' and population = 'JPT' LIMIT 3""" searchClient.runQuery(query)
def main(argv): # Step 1 - Discovery # query for relevant DRS objects searchClient = DiscoverySearchClient( 'https://ga4gh-search-adapter-presto-public.prod.dnastack.com') query = """select submitter_id, read_drs_id drsid from thousand_genomes.onek_genomes.ssd_drs where population = 'ACB' limit 3""" res = searchClient.runQuery(query) print(res)
# Find EGA files for pancreatic adenocarcinoma samples via GA4GH Search.
from fasp.search import DiscoverySearchClient


def main():
    searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com')
    query = """SELECT sample_submitter_id, fileid, filename
    FROM dbgap_demo.scr_ega.scr_egapancreatic_sample_multi p
    join dbgap_demo.scr_ega.scr_egapancreatic_files f on f.sample_primary_id = p.sample_primary_id
    where phenotype = 'pancreatic adenocarcinoma' limit 3"""
    query_job = searchClient.runQuery(query)
    for row in query_job:
        print("sample={}, EGAFileID={}".format(row[0], row[1]))
import datetime

from fasp.runner import FASPRunner
from fasp.search import DiscoverySearchClient
# module paths below are assumed from the fasp package layout
from fasp.loc import DRSMetaResolver
from fasp.workflow import DNAStackWESClient


def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)
    creditor = faspRunner.creditor

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/', debug=False)
    query = "SELECT file_name, compact_drs_id, hostbased_drs_id, drs_id from thousand_genomes.onek_genomes.onek_recal_variants_drs where chromosome = 'chr21' and annotated = false"
    print(query)
    query_job = searchClient.runQuery(query)  # Send the query
    creditor.creditClass(searchClient)

    # Step 2 - DRS - use the MetaResolver to send drs ids to the right service
    drsResolver = DRSMetaResolver(getReg=False)

    # Step 3 - set up a class that runs a compute for us
    wesClient = DNAStackWESClient('~/.keys/dnastack_wes_credentials.json')

    # repeat steps 2 and 3 for each row of the query
    # this example should find ids for the same file in both BioDataCatalyst and Anvil
    for row in query_job:
        drs_id = row[1]
        print("vcffile={}, compact drsID={}".format(row[0], drs_id))

        # Step 2 - Use DRS to get the URL
        objInfo = drsResolver.getObject(drs_id)
        drsClient, localid = drsResolver.getClient(drs_id)
        print(drsClient)
        creditor.creditClass(drsClient)
        fileSize = objInfo['size']
        vcfurl = drsResolver.getAccessURL(drs_id, 'gs')

        # Step 3 - Run a pipeline on the file at the drs url
        pipeline_id = wesClient.runGWASWorkflow(vcfurl, 'gs://dnastack-public-bucket/thousand_genomes_meta.csv')
        creditor.creditClass(wesClient)
        print('submitted:{}'.format(pipeline_id))

        outfile = ''
        via = 'WES'
        note = 'GWAS'
        time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        faspRunner.logRun(time, via, note, pipeline_id, outfile, str(fileSize),
                          searchClient, drsClient, wesClient)
def main(argv): # Step 1 - Discovery # query for relevant DRS objects searchClient = DiscoverySearchClient( 'https://ga4gh-search-adapter-presto-public.prod.dnastack.com') query = """select submitter_id, read_drs_id drsid from thousand_genomes.onek_genomes.ssd_drs where population = 'ACB' limit 1""" query_job = searchClient.runQuery(query) for row in query_job: print("subject={}, drsID={}".format(row[0], row[1]))
from fasp.search import DiscoverySearchClient, BigQuerySearchClient


def main(argv):
    # Step 1 - Discovery
    # query two sources for relevant DRS objects
    discoveryClients = {
        "sb": DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com'),
        "bdc": BigQuerySearchClient()
    }

    crdcquery = """SELECT sp.dbGaP_Subject_ID, 'sb:'||sb_drs_id
    FROM dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi sp
    join dbgap_demo.scr_gecco_susceptibility.sample_multi sm on sm.dbgap_subject_id = sp.dbgap_subject_id
    join dbgap_demo.scr_gecco_susceptibility.sb_drs_index di on di.sample_id = sm.sample_id
    where AGE between 45 and 55 and sex = 'Female' and file_type = 'cram' limit 3"""

    bdcquery = """SELECT sp.dbGaP_Subject_ID, 'bdc:'||read_drs_id
    FROM `isbcgc-216220.COPDGene.Subject_MULTI` sm
    join `isbcgc-216220.COPDGene.Subject_Phenotypes_HMB` sp on sp.dbgap_subject_id = sm.dbgap_subject_id
    join `isbcgc-216220.COPDGene.COPD_DRS` drs on drs.su_submitter_id = sm.subject_id
    where gender = '2' and Age_Enroll between 45 and 55
    LIMIT 3"""

    results = discoveryClients['sb'].runQuery(crdcquery)  # Send the query
    results += discoveryClients['bdc'].runQuery(bdcquery)

    # repeat steps 2 and 3 for each row of the query
    for row in results:
        print("subject={}, drsID={}".format(row[0], row[1]))
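# Illustrative sketch (not part of the original script): the 'sb:' and 'bdc:' prefixes added
# in the two queries above exist so that a later step can route each DRS id to the matching
# DRS client, as the combined dbGaP example at the end of this section does. Assuming a
# drsClients dict keyed by the same prefixes, the routing would look like:
#
#   for row in results:
#       prefix, drsid = row[1].split(":", 1)
#       url = drsClients[prefix].getAccessURL(drsid)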
def main(argv): # Step 1 - Discovery # query for relevant DRS objects searchClient = DiscoverySearchClient( 'https://ga4gh-search-adapter-presto-public.prod.dnastack.com/') query = "select submitter_id, read_drs_id drsid from thousand_genomes.onek_genomes.ssd_drs where population = 'ACB' limit 1" query_job = searchClient.runQuery(query) # Step 2 - DRS - set up a DRS Client # CRDC drsClient = sdlDRSClient('~/.keys/prj_11218_D17199.ngc') # Step 3 - set up a class that run a compute for us wesClient = DNAStackWESClient('~/.keys/DNAStackWESkey.json') # A log is helpful to keep track of the computes we've submitted faspRunner = FASPRunner() # repeat steps 2 and 3 for each row of the query for row in query_job: print("subject={}, drsID={}".format(row[0], row[1])) # Step 2 - Use DRS to get the URL #objInfo = drsClient.getObject(row[1]) # for testing acc = 'SRR5368359.sra' objInfo = drsClient.getObject(acc) fileSize = objInfo['size'] print(fileSize) # we've predetermined we want to use the gs copy in this case #url = drsClient.getAccessURL(row[1], 'gs') res = drsClient.getAccessURL(acc, 'gs.us') url = res['url'] print(url) # Step 3 - Run a pipeline on the file at the drs url outfile = "{}.txt".format(row[0]) pipeline_id = wesClient.runWorkflow(url, outfile) print('submitted:{}'.format(pipeline_id)) via = 'WES' note = 'WES MD5 on NCBI SDL' time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S") faspRunner.logRun(time, via, note, pipeline_id, outfile, str(fileSize), searchClient, drsClient, wesClient)
import datetime

from fasp.runner import FASPRunner
from fasp.search import DiscoverySearchClient
# module paths below are assumed from the fasp package layout
from fasp.loc import EGAhtsget
from fasp.workflow import GCPLSsamtools


def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)
    creditor = faspRunner.creditor
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant files
    searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/')
    query = "SELECT sample_submitter_id, fileid, filename FROM dbgap_demo.scr_ega.scr_egapancreatic_sample_multi p join dbgap_demo.scr_ega.scr_egapancreatic_files f on f.sample_primary_id = p.sample_primary_id where phenotype = 'pancreatic adenocarcinoma' limit 3"
    query_job = searchClient.runQuery(query)

    # Step 2 - Use htsget at EGA
    htsgetClient = EGAhtsget('~/.keys/ega.credentials')

    # Step 3 - set up a class that runs a compute for us
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    wesClient = GCPLSsamtools(location, settings['GCPOutputBucket'])

    # repeat steps 2 and 3 for each row of the query
    for row in query_job:
        print("sample={}, EGAFileID={}".format(row[0], row[1]))

        # Step 2 - Use htsget to determine the file size
        fileSize = htsgetClient.getSize(row[1])
        print(fileSize)
        # we've predetermined we want to use the gs copy in this case
        #url = drsClient.getAccessURL(row[1], 'gs')
        #htsgetClient.htsget(row[1], 'chr1', 100000, 102000, 'BAM', row[2])
        localfile = 'NA19377.unmapped.ILLUMINA.bwa.LWK.low_coverage.20120522.bam'  #row[2]

        # Step 3 - Run a pipeline on the local file
        outfile = "{}.txt".format(row[0])
        pipeline_id = wesClient.runWorkflow(localfile, outfile)
        #print('submitted:{}'.format(pipeline_id))

        via = 'local'
        note = 'samtools on htsget BAM'
        time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        faspRunner.logRun(time, via, note, pipeline_id, outfile, str(fileSize),
                          searchClient, htsgetClient, wesClient)
from fasp.search import DiscoverySearchClient


def main(argv):
    # Step 1 - Discovery
    # query for relevant sequence records
    searchClient = DiscoverySearchClient('https://search-presto-public-covid19.prod.dnastack.com')

    # List tables
    #searchClient.listTables()
    # List table schema
    #searchClient.listTableInfo('coronavirus_dnastack_curated.covid_cloud_production.sequences')

    query = 'select accession, biosample, genus, species from coronavirus_dnastack_curated.covid_cloud_production.sequences limit 10'
    res = searchClient.runQuery(query, returnType='dataframe')
    print(res)
def main(argv): searchClient = DiscoverySearchClient( 'https://ga4gh-search-adapter-presto-public.prod.dnastack.com') query = """select id, patient from kidsfirst.ga4gh_tables.patient where json_extract_scalar(patient, '$.gender') = 'female' limit 3""" showDetail = True res = searchClient.runQuery(query) if showDetail: print(json.dumps(res, indent=2)) for r in res: patient = r[1] print(patient['id'], patient['gender'])
from fasp.search import DiscoverySearchClient


def main(argv):
    # Step 1 - Discovery
    # explore the tables available on this Search service
    searchClient = DiscoverySearchClient('https://search-presto-public-covid19.prod.dnastack.com')

    # List tables
    tList = searchClient.listTables(verbose=False)

    # List each table's schema description
    for t in tList:
        res = searchClient.listTableInfo(t, verbose=False)
        print(t)
        if 'data_model' in res:
            print(res['data_model']['description'])
        else:
            print('No data model')
def main(argv): pp_dbgap_join = """SELECT sp.dbGaP_Subject_ID, 'sbcgc:'||sb_drs_id FROM dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi sp join dbgap_demo.scr_gecco_susceptibility.sample_multi sm on sm.dbgap_subject_id = sp.dbgap_subject_id join dbgap_demo.scr_gecco_susceptibility.sb_drs_index di on di.sample_id = sm.sample_id join sample_phenopackets.ga4gh_tables.gecco_phenopackets pp on pp.id = sm.biosample_accession where json_extract_scalar(pp.phenopacket, '$.subject.sex') = 'MALE' and file_type = 'cram' limit 3""" # Step 1 - Discovery # query for relevant DRS objects searchClient = DiscoverySearchClient( 'https://ga4gh-search-adapter-presto-public.prod.dnastack.com', debug=False) results = searchClient.runQuery(pp_dbgap_join) # repeat steps 2 and 3 for each row of the query for row in results: print("subject={}, drsID={}".format(row[0], row[1]))
import json

from fasp.search import DiscoverySearchClient


def main(argv):
    # Step 1 - Discovery
    # query for a specific patient record
    searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com')

    # List tables
    #searchClient.listTables()
    # List table schema
    #searchClient.listTableInfo('coronavirus_dnastack_curated.covid_cloud_production.sequences')

    query = """select id, patient from kidsfirst.ga4gh_tables.patient
    where json_extract_scalar(patient, '$.id') = '451133' limit 3"""
    res = searchClient.runQuery(query)
    print(json.dumps(res, indent=2))
from fasp.runner import FASPRunner
from fasp.search import DiscoverySearchClient
# module paths below are assumed from the fasp package layout
from fasp.loc import DRSClient
from fasp.workflow import GCPLSsamtools


def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/', debug=True)
    query = "SELECT s.su_submitter_id, drs_id FROM thousand_genomes.onek_genomes.ssd_drs s join thousand_genomes.onek_genomes.sra_drs_files f on f.sample_name = s.su_submitter_id where filetype = 'bam' and mapped = 'mapped' and sequencing_type ='exome' and population = 'JPT' LIMIT 3"

    # Step 2 - DRS - a generic DRS client pointed at the NCBI locate service
    drsClient = DRSClient('https://locate.ncbi.nlm.nih.gov', access_id='2', debug=True, public=True)

    # Step 3 - set up a class that runs samtools for us
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

    faspRunner.configure(searchClient, drsClient, mysam)
    faspRunner.runQuery(query, 'One k query SRA DRS')
def main(argv): faspRunner = FASPRunner(pauseSecs=0) # Step 1 - Discovery # query for relevant DRS objects searchClient = DiscoverySearchClient( 'https://ga4gh-search-adapter-presto-public.prod.dnastack.com/') query = "select submitter_id, read_drs_id drsid from thousand_genomes.onek_genomes.ssd_drs where population = 'ACB' limit 3" # Step 2 - DRS - set up a DRS Client drsClient = bdcDRSClient('~/.keys/bdc_credentials.json', 'gs') # Step 3 - set up a class that run a compute for us wesClient = DNAStackWESClient('~/.keys/dnastack_wes_credentials.json') faspRunner.configure(searchClient, drsClient, wesClient) faspRunner.runQuery(query, 'One k query using Search and WES')
def main(argv): faspRunner = FASPRunner(pauseSecs=0) pp_dbgap_join = "SELECT sp.dbGaP_Subject_ID, 'sbcgc:'||sb_drs_id FROM dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi sp join dbgap_demo.scr_gecco_susceptibility.sample_multi sm on sm.dbgap_subject_id = sp.dbgap_subject_id join dbgap_demo.scr_gecco_susceptibility.sb_drs_index di on di.sample_id = sm.sample_id join sample_phenopackets.ga4gh_tables.gecco_phenopackets pp on pp.id = sm.biosample_accession where json_extract_scalar(pp.phenopacket, '$.subject.sex') = 'MALE' and file_type = 'cram' limit 3" # Step 1 - Discovery # query for relevant DRS objects searchClient = DiscoverySearchClient( 'https://ga4gh-search-adapter-presto-public.prod.dnastack.com/', debug=True) # Step 2 - DRS - a metaresolver will deal with which drs server is required drsClient = DRSMetaResolver() # Step 3 - set up a class that run a compute for us wesClient = DNAStackWESClient('~/.keys/dnastack_wes_credentials.json') faspRunner.configure(searchClient, drsClient, wesClient) faspRunner.runQuery(pp_dbgap_join, 'Phenopacket Gecco')
from fasp.runner import FASPRunner
from fasp.search import DiscoverySearchClient
# module paths below are assumed from the fasp package layout
from fasp.loc import bdcDRSClient
from fasp.workflow import GCPLSsamtools


def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/')
    query = "select submitter_id, read_drs_id drsid from thousand_genomes.onek_genomes.ssd_drs where population = 'BEB' limit 3"

    # Step 2 - DRS - set up a DRS Client
    # BioDataCatalyst
    drsClient = bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')

    # Step 3 - set up a class that runs samtools for us
    # providing the location where we want the results to go
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

    faspRunner.configure(searchClient, drsClient, mysam)
    faspRunner.runQuery(query, 'One k query using Search')
from fasp.search import DiscoverySearchClient


def getMapping(searchClient, table, column):
    """Return a dict mapping coded values to display values for the given table and column."""
    query = ("select valueString, maptoValue from search_cloud.cshcodeathon.md_value_map "
             "where table_name = '{}' and column_name='{}'").format(table, column)
    mapping = searchClient.runQuery(query)
    mapDict = {}
    for row in mapping:
        mapDict[row[0]] = row[1]
    return mapDict


searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com', debug=False)

table_name = 'search_cloud.cshcodeathon.organoid_profiling_pc_subject_phenotypes_gru'
map_col = 'sex'
mapping = getMapping(searchClient, table_name, map_col)
print(mapping)

res = searchClient.runOneTableQuery(
    column_list=['dbgap_subject_id', 'age', 'race', 'sex'],
    table=table_name,
    limit=100)
print(res)

# apply the mapping to recode the values in the mapped column
res[map_col] = res[map_col].replace(list(mapping.keys()), list(mapping.values()))
# imports
from fasp.search import DiscoverySearchClient

searchClient = DiscoverySearchClient('https://search-presto-public-covid19.prod.dnastack.com')

res = searchClient.runOneTableQuery(
    column_list=['accession', 'biosample', 'genus', 'species'],
    table='coronavirus_dnastack_curated.covid_cloud_production.sequences',
    limit=15)
print(res)
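# The same result can also be fetched with a full SQL query; a sketch based on the
# runQuery(returnType='dataframe') form used elsewhere in these examples:
res2 = searchClient.runQuery(
    "select accession, biosample, genus, species "
    "from coronavirus_dnastack_curated.covid_cloud_production.sequences limit 15",
    returnType='dataframe')
print(res2)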
import datetime

from fasp.runner import FASPRunner
from fasp.search import DiscoverySearchClient, BigQuerySearchClient
# module paths below are assumed from the fasp package layout
from fasp.loc import sbcgcDRSClient, bdcDRSClient
from fasp.workflow import GCPLSsamtools, samtoolsSBClient


def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)
    creditor = faspRunner.creditor
    settings = faspRunner.settings

    # set your Seven Bridges CGC project using what you have put in FASP Settings
    sbProject = settings['SevenBridgesProject']
    sbInstance = settings['SevenBridgesInstance']

    # Step 1 - Discovery
    # query for relevant DRS objects
    discoveryClients = {
        "sb": DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/'),
        "bdc": BigQuerySearchClient()
    }

    crdcquery = "SELECT sp.dbGaP_Subject_ID, 'sb:'||sb_drs_id FROM dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi sp join dbgap_demo.scr_gecco_susceptibility.sample_multi sm on sm.dbgap_subject_id = sp.dbgap_subject_id join dbgap_demo.scr_gecco_susceptibility.sb_drs_index di on di.sample_id = sm.sample_id where AGE between 45 and 55 and sex = 'Female' and file_type = 'cram' limit 3"

    bdcquery = """SELECT sp.dbGaP_Subject_ID, 'bdc:'||read_drs_id
    FROM `isbcgc-216220.COPDGene.Subject_MULTI` sm
    join `isbcgc-216220.COPDGene.Subject_Phenotypes_HMB` sp on sp.dbgap_subject_id = sm.dbgap_subject_id
    join `isbcgc-216220.COPDGene.COPD_DRS` drs on drs.su_submitter_id = sm.subject_id
    where gender = '2' and Age_Enroll between 45 and 55
    LIMIT 3"""

    results = discoveryClients['sb'].runQuery(crdcquery)  # Send the query
    creditor.creditFromList('dbGapSSD')
    creditor.creditClass(discoveryClients['sb'])
    results += discoveryClients['bdc'].runQuery(bdcquery)
    creditor.creditFromList('BDCData')

    # Step 2 - DRS - set up DRS Clients
    drsClients = {
        "sb": sbcgcDRSClient('~/.keys/sevenbridges_keys.json', 's3'),
        "bdc": bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')
    }
    print('setting credentials ')
    creditor.creditFromList('dbGaPFence')

    # Step 3 - set up a class that runs samtools for us
    # providing the location for the results
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    sam2 = GCPLSsamtools(location, settings['GCPOutputBucket'])
    samClients = {
        "sb": samtoolsSBClient(sbInstance, sbProject),
        "bdc": sam2
    }

    # repeat steps 2 and 3 for each row of the query
    for row in results:
        print("subject={}, drsID={}".format(row[0], row[1]))
        resRow = [row[0], row[1]]

        # Step 2 - Use DRS to get the URL
        # the prefix identifies which DRS, search and compute client to use
        prefix, drsid = row[1].split(":", 1)
        drsClient = drsClients[prefix]
        searchClient = discoveryClients[prefix]
        creditor.creditClass(drsClient)
        url = drsClient.getAccessURL(drsid)
        print(url)
        #objInfo = drsClient.getObject(drsid)
        #print (objInfo)
        #fileSize = objInfo['size']
        fileSize = 0

        # Step 3 - Run a pipeline on the file at the drs url
        if url is not None:
            outfile = "{}.txt".format(row[0])
            mysam = samClients[prefix]
            creditor.creditClass(mysam)
            via = 'sh'
            note = 'Two dbGaP sources'
            time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
            run_id = mysam.runWorkflow(url, outfile)
            faspRunner.logRun(time, via, note, run_id, outfile, fileSize,
                              searchClient, drsClient, mysam)
            resRow.append('OK')
        else:
            print('could not get DRS url')
            resRow.append('unauthorized')