def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = BigQuerySearchClient()

    # query = """
    #     SELECT subject_id, read_drs_id
    #     FROM `isbcgc-216220.COPDGene.phenotype_drs`
    #     where weight_kg between 91.8 and 93.0
    #     LIMIT 1"""

    query = """
        SELECT submitter_id, read_drs_id
        FROM `isbcgc-216220.onek_genomes.ssd_drs`
        where population = 'BEB'
        LIMIT 1"""

    # BioDataCatalyst
    drsClient = bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')

    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

    faspRunner.configure(searchClient, drsClient, mysam)
    faspRunner.runQuery(query, 'One k query')
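# The scripts in this section all read their cloud configuration through
# faspRunner.settings. A minimal sketch of the keys they rely on, assuming a
# simple key/value mapping; the file format and loader are FASPRunner
# internals, and every value below is an illustrative placeholder.
example_settings = {
    'GCPProject': 'my-gcp-project',              # placeholder
    'GCPPipelineRegion': 'us-central1',          # placeholder
    'GCPOutputBucket': 'gs://my-output-bucket',  # placeholder
    'SevenBridgesInstance': 'cgc',               # placeholder
    'SevenBridgesProject': 'myuser/myproject',   # placeholder
}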
def main(argv):
    faspRunner = FASPRunner()
    settings = faspRunner.settings

    logTable = pd.read_table(faspRunner.pipelineLogFile, dtype={'status': str})

    sbSystem = settings['SevenBridgesInstance']
    sbProject = settings['SevenBridgesProject']
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    gcsam = GCPLSsamtools(location, settings['GCPOutputBucket'])

    wesClients = {
        'samtoolsSBClient': samtoolsSBClient(sbSystem, sbProject),
        'DNAStackWESClient': DNAStackWESClient('~/.keys/DNAStackWESkey.json'),
        'GCPLSsamtools': gcsam
    }

    for i, row in logTable.iterrows():
        wesClientClassName = row["wesClient"]
        run_id = row["pipeline_id"]
        if run_id == 'paste here':
            logTable.at[i, 'status'] = 0
        else:
            if pd.isna(row["status"]) or row["status"].lower() == 'running':
                wc = wesClients[wesClientClassName]
                status = wc.getTaskStatus(row["pipeline_id"])
                print('Updated run:{} status:{}'.format(run_id, status))
                logTable.at[i, 'status'] = status

    #logTable.to_csv('pipeline_w_status.txt', sep='\t', index=False)
    logTable.to_csv(faspRunner.pipelineLogFile, sep='\t', index=False)
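# For orientation, a minimal in-memory version of the pipeline log the loop
# above updates; the column names are taken from the code, the values are
# illustrative only.
import pandas as pd

demo_log = pd.DataFrame([
    {'pipeline_id': 'run-12345', 'wesClient': 'DNAStackWESClient', 'status': 'running'},
    {'pipeline_id': 'paste here', 'wesClient': 'GCPLSsamtools', 'status': None},
])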
def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = BigQuerySearchClient()

    query = """
        SELECT s.sample_name, drs_id, s.acc, assay_type, filename
        FROM `nih-sra-datastore.sra.metadata` s, unnest(attributes) att
        join `isbcgc-216220.onek_genomes.sra_drs_files` d on d.acc = s.acc
        where filetype = 'bam' and mapped = 'mapped'
        and sequencing_type = 'exome'
        and att.k = 'population_sam' and att.v = 'JPT'
        LIMIT 3"""

    #drsClient = DRSMetaResolver()
    drsClient = DRSClient('https://locate.ncbi.nlm.nih.gov', access_id='2', public=True)

    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

    faspRunner.configure(searchClient, drsClient, mysam)
    faspRunner.runQuery(query, 'One k query SRA DRS')
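# Note on the query above: `unnest(attributes) att` flattens the SRA
# metadata table's key/value attribute array, so that
# att.k = 'population_sam' and att.v = 'JPT' selects 1000 Genomes JPT
# samples. Illustrative shape of one record's attributes (assumed):
#   attributes = [{'k': 'population_sam', 'v': 'JPT'}, ...]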
def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)
    creditor = faspRunner.creditor
    settings = faspRunner.settings

    # set your Seven Bridges CGC project using what you have put in FASP Settings
    sbProject = settings['SevenBridgesProject']
    sbInstance = settings['SevenBridgesInstance']

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = BigQuerySearchClient()

    # Step 2 - DRS - set up a DRS Client
    drsClient = crdcDRSClient('~/.keys/crdc_credentials.json', 's3')

    # Step 3 - set up a class that runs samtools for us
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    sam2 = GCPLSsamtools(location, settings['GCPOutputBucket'])
    mysams = {
        's3': samtoolsSBClient(sbInstance, sbProject),
        'gs': sam2
    }

    query = """
        SELECT 'case_'||associated_entities__case_gdc_id, file_id
        FROM `isb-cgc.GDC_metadata.rel24_fileData_active`
        where data_format = 'BAM'
        and project_disease_type = 'Breast Invasive Carcinoma'
        limit 3"""

    print(query)
    query_job = searchClient.runQuery(query)  # Send the query
    creditor.creditFromList('ISBGDCData')

    # repeat steps 2 and 3 for each row of the query
    for row in query_job:
        print("subject={}, drsID={}".format(row[0], row[1]))

        # Step 2 - Use DRS to get the URL
        objInfo = drsClient.getObject(row[1])
        creditor.creditClass(drsClient)
        fileSize = objInfo['size']

        outfile = "{}.txt".format(row[0])
        # submit to both AWS and GCP
        for cl, mysam in mysams.items():
            url = drsClient.getAccessURL(row[1], cl)

            # Step 3 - Run a pipeline on the file at the drs url
            creditor.creditClass(mysam)
            task_id = mysam.runWorkflow(url, outfile)

            via = 'py'
            note = 'double submit'
            time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
            faspRunner.logRun(time, via, note, task_id, outfile, str(fileSize),
                searchClient, drsClient, mysam)

    creditor.creditFromList('FASPScript8_sdrf', closeImage=False)
def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = BigQuerySearchClient()
    query = """
        SELECT 'case_'||associated_entities__case_gdc_id, file_id
        FROM `isb-cgc.GDC_metadata.rel24_fileData_active`
        where data_format = 'BAM'
        and project_disease_type = 'Breast Invasive Carcinoma'
        limit 3"""

    drsClient = crdcDRSClient('~/.keys/crdc_credentials.json', 's3')

    # Step 3 - set up a class that runs samtools for us
    sbProject = faspRunner.settings['SevenBridgesProject']
    sbInst = faspRunner.settings['SevenBridgesInstance']
    mysam = samtoolsSBClient(sbInst, sbProject)

    faspRunner.configure(searchClient, drsClient, mysam)
    faspRunner.runQuery(query, 'GDC query SB compute')
def main(argv):
    # edit the following line for where you put your ngc credentials file from dbGaP
    credentials_file = '~/.keys/prj_14565.ngc'

    faspRunner = FASPRunner(pauseSecs=0)
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = localSearchClient()
    query_job = searchClient.runQuery('')

    drsClient = sdlDRSClient(credentials_file, debug=True)

    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    mysam = GCPLSsamtools(location, settings['GCPOutputBucket'], debug=True)

    # repeat steps 2 and 3 for each row of the query
    for row in query_job:
        print("subject={}, drsID={}".format(row[0], row[1]))

        # Step 2 - Use DRS to get the URL
        objInfo = drsClient.getObject(row[1])
        fileSize = objInfo['size']
        print(fileSize)

        # we've predetermined we want to use the gs copy in this case
        #url = drsClient.getAccessURL(row[1], 'gs')
        res = drsClient.getAccessURL(row[1], 'gs.us')
        url = res['url']
        print(url)

        # Step 3 - Run a pipeline on the file at the drs url
        outfile = "{}.txt".format(row[0])
        pipeline_id = mysam.runWorkflow(url, outfile)
        print('submitted:{}'.format(pipeline_id))

        via = ''
        note = 'Anvil GTEX Test via SDL'
        time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        faspRunner.logRun(time, via, note, pipeline_id, outfile, str(fileSize),
            searchClient, drsClient, mysam)
def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)
    creditor = faspRunner.creditor

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = DiscoverySearchClient(
        'https://ga4gh-search-adapter-presto-public.prod.dnastack.com/', debug=False)
    query = "SELECT file_name, compact_drs_id, hostbased_drs_id, drs_id from thousand_genomes.onek_genomes.onek_recal_variants_drs where chromosome = 'chr21' and annotated = false"
    print(query)
    query_job = searchClient.runQuery(query)  # Send the query
    creditor.creditClass(searchClient)

    # Step 2 - DRS - use the MetaResolver to send drs ids to the right service
    drsResolver = DRSMetaResolver(getReg=False)

    # Step 3 - set up a class that runs a compute for us
    wesClient = DNAStackWESClient('~/.keys/dnastack_wes_credentials.json')

    # repeat steps 2 and 3 for each row of the query
    # this example should find IDs for the same file in both BioDataCatalyst and Anvil
    for row in query_job:
        drs_id = row[1]
        print("vcffile={}, compact drsID={}".format(row[0], drs_id))

        # Step 2 - Use DRS to get the URL
        objInfo = drsResolver.getObject(drs_id)
        drsClient, localid = drsResolver.getClient(drs_id)
        print(drsClient)
        creditor.creditClass(drsClient)
        fileSize = objInfo['size']
        vcfurl = drsResolver.getAccessURL(drs_id, 'gs')

        # Step 3 - Run a pipeline on the file at the drs url
        pipeline_id = wesClient.runGWASWorkflow(
            vcfurl, 'gs://dnastack-public-bucket/thousand_genomes_meta.csv')
        creditor.creditClass(wesClient)
        print('submitted:{}'.format(pipeline_id))

        outfile = ''
        via = 'WES'
        note = 'GWAS'
        time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        faspRunner.logRun(time, via, note, pipeline_id, outfile, str(fileSize),
            searchClient, drsClient, wesClient)
def main(argv):
    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = DiscoverySearchClient(
        'https://ga4gh-search-adapter-presto-public.prod.dnastack.com/')
    query = "select submitter_id, read_drs_id drsid from thousand_genomes.onek_genomes.ssd_drs where population = 'ACB' limit 1"
    query_job = searchClient.runQuery(query)

    # Step 2 - DRS - set up a DRS Client
    # CRDC
    drsClient = sdlDRSClient('~/.keys/prj_11218_D17199.ngc')

    # Step 3 - set up a class that runs a compute for us
    wesClient = DNAStackWESClient('~/.keys/DNAStackWESkey.json')

    # A log is helpful to keep track of the computes we've submitted
    faspRunner = FASPRunner()

    # repeat steps 2 and 3 for each row of the query
    for row in query_job:
        print("subject={}, drsID={}".format(row[0], row[1]))

        # Step 2 - Use DRS to get the URL
        #objInfo = drsClient.getObject(row[1])
        # for testing
        acc = 'SRR5368359.sra'
        objInfo = drsClient.getObject(acc)
        fileSize = objInfo['size']
        print(fileSize)

        # we've predetermined we want to use the gs copy in this case
        #url = drsClient.getAccessURL(row[1], 'gs')
        res = drsClient.getAccessURL(acc, 'gs.us')
        url = res['url']
        print(url)

        # Step 3 - Run a pipeline on the file at the drs url
        outfile = "{}.txt".format(row[0])
        pipeline_id = wesClient.runWorkflow(url, outfile)
        print('submitted:{}'.format(pipeline_id))

        via = 'WES'
        note = 'WES MD5 on NCBI SDL'
        time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        faspRunner.logRun(time, via, note, pipeline_id, outfile, str(fileSize),
            searchClient, drsClient, wesClient)
def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = BigQuerySearchClient()
    query = """SELECT sra.biosample, sra.acc||'.cram'
        FROM `isbcgc-216220.GECCO_CRC_Susceptibility.Subject_Phenotypes` sp
        join `isbcgc-216220.GECCO_CRC_Susceptibility.Sample_MULTI` sm on sm.dbgap_subject_id = sp.dbgap_subject_id
        join `nih-sra-datastore.sra.metadata` sra on sm.BioSample_Accession = sra.biosample
        where AGE between 45 and 55 and sex = 'Female'
        limit 3"""
    query_job = searchClient.runQuery(query)

    # Step 2 - DRS - set up a DRS Client
    # CRDC
    drsClient = sdlDRSClient('~/.keys/prj_14565.ngc', True)

    # Step 3 - set up a class that runs a compute for us
    wesClient = DNAStackWESClient('~/.keys/dnastack_wes_credentials.json')

    # repeat steps 2 and 3 for each row of the query
    for row in query_job:
        print("sample={}, drsID={}".format(row[0], row[1]))

        # Step 2 - Use DRS to get the URL
        objInfo = drsClient.getObject(row[1])
        fileSize = objInfo['size']
        print(fileSize)

        # we've predetermined we want to use the gs copy in this case
        #url = drsClient.getAccessURL(row[1], 'gs')
        res = drsClient.getAccessURL(row[1], 'gs.us')
        url = res['url']
        print(url)

        # Step 3 - Run a pipeline on the file at the drs url
        outfile = "{}.txt".format(row[0])
        pipeline_id = wesClient.runWorkflow(url, outfile)
        print('submitted:{}'.format(pipeline_id))

        via = 'WES'
        note = 'WES MD5 on NCBI SDL'
        time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        faspRunner.logRun(time, via, note, pipeline_id, outfile, str(fileSize),
            searchClient, drsClient, wesClient)
def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)
    creditor = faspRunner.creditor
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant files
    searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/')
    query = "SELECT sample_submitter_id, fileid, filename FROM dbgap_demo.scr_ega.scr_egapancreatic_sample_multi p join dbgap_demo.scr_ega.scr_egapancreatic_files f on f.sample_primary_id = p.sample_primary_id where phenotype = 'pancreatic adenocarcinoma' limit 3"
    query_job = searchClient.runQuery(query)

    # Step 2 - Use htsget at EGA
    htsgetClient = EGAhtsget('~/.keys/ega.credentials')

    # Step 3 - set up a class that runs a compute for us
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    wesClient = GCPLSsamtools(location, settings['GCPOutputBucket'])

    # repeat steps 2 and 3 for each row of the query
    for row in query_job:
        print("sample={}, EGAFileID={}".format(row[0], row[1]))

        # Step 2 - Use htsget to size and retrieve the file
        fileSize = htsgetClient.getSize(row[1])
        print(fileSize)

        # we've predetermined we want to use the gs copy in this case
        #url = drsClient.getAccessURL(row[1], 'gs')
        #htsgetClient.htsget(row[1], 'chr1', 100000, 102000, 'BAM', row[2])
        localfile = 'NA19377.unmapped.ILLUMINA.bwa.LWK.low_coverage.20120522.bam'  #row[2]

        # Step 3 - Run a pipeline on the local file
        outfile = "{}.txt".format(row[0])
        pipeline_id = wesClient.runWorkflow(localfile, outfile)
        #print('submitted:{}'.format(pipeline_id))

        via = 'local'
        note = 'samtools on htsget BAM'
        time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        faspRunner.logRun(time, via, note, pipeline_id, outfile, str(fileSize),
            searchClient, htsgetClient, wesClient)
def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/', debug=True)
    query = "SELECT s.su_submitter_id, drs_id FROM thousand_genomes.onek_genomes.ssd_drs s join thousand_genomes.onek_genomes.sra_drs_files f on f.sample_name = s.su_submitter_id where filetype = 'bam' and mapped = 'mapped' and sequencing_type = 'exome' and population = 'JPT' LIMIT 3"

    drsClient = DRSClient('https://locate.ncbi.nlm.nih.gov', access_id='2', debug=True, public=True)

    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

    faspRunner.configure(searchClient, drsClient, mysam)
    faspRunner.runQuery(query, 'One k query SRA DRS')
def main(argv):
    faspRunner = FASPRunner()
    settings = faspRunner.settings

    searchClient = Gen3ManifestClient('./fasp/data/gtex/gtex-cram-manifest.json')
    drsClient = anvilDRSClient('~/.keys/anvil_credentials.json', access_id='s3')
    wesClient = sbWESClient(settings['SevenBridgesInstance'],
        settings['SevenBridgesProject'], '~/.keys/sbcgc_key.json')

    faspRunner.configure(searchClient, drsClient, wesClient)
    faspRunner.runQuery(3, 'Anvil GTEX Test')
def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = DiscoverySearchClient(
        'https://ga4gh-search-adapter-presto-public.prod.dnastack.com/')
    query = "select submitter_id, read_drs_id drsid from thousand_genomes.onek_genomes.ssd_drs where population = 'ACB' limit 3"

    # Step 2 - DRS - set up a DRS Client
    drsClient = bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')

    # Step 3 - set up a class that runs a compute for us
    wesClient = DNAStackWESClient('~/.keys/dnastack_wes_credentials.json')

    faspRunner.configure(searchClient, drsClient, wesClient)
    faspRunner.runQuery(query, 'One k query using Search and WES')
def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)

    pp_dbgap_join = "SELECT sp.dbGaP_Subject_ID, 'sbcgc:'||sb_drs_id FROM dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi sp join dbgap_demo.scr_gecco_susceptibility.sample_multi sm on sm.dbgap_subject_id = sp.dbgap_subject_id join dbgap_demo.scr_gecco_susceptibility.sb_drs_index di on di.sample_id = sm.sample_id join sample_phenopackets.ga4gh_tables.gecco_phenopackets pp on pp.id = sm.biosample_accession where json_extract_scalar(pp.phenopacket, '$.subject.sex') = 'MALE' and file_type = 'cram' limit 3"

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = DiscoverySearchClient(
        'https://ga4gh-search-adapter-presto-public.prod.dnastack.com/', debug=True)

    # Step 2 - DRS - a MetaResolver will deal with which DRS server is required
    drsClient = DRSMetaResolver()

    # Step 3 - set up a class that runs a compute for us
    wesClient = DNAStackWESClient('~/.keys/dnastack_wes_credentials.json')

    faspRunner.configure(searchClient, drsClient, wesClient)
    faspRunner.runQuery(pp_dbgap_join, 'Phenopacket Gecco')
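# Note on the query above: json_extract_scalar(pp.phenopacket,
# '$.subject.sex') digs into stored GA4GH Phenopacket JSON, shaped roughly
# like the following (illustrative, trimmed to the field used here):
#   {"id": "...", "subject": {"id": "...", "sex": "MALE"}}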
def main(argv):
    # edit the following line for where you put your credentials file from AnVIL
    credentials_file = '~/.keys/anvil_credentials.json'

    faspRunner = FASPRunner(pauseSecs=0)
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = localSearchClient()

    #drsClient = DRSMetaResolver()
    drsClient = anvilDRSClient(credentials_file, settings['GCPProject'], 'gs')

    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    workflowClient = GCPLSsamtools(location, settings['GCPOutputBucket'])

    faspRunner.configure(searchClient, drsClient, workflowClient)
    faspRunner.runQuery(12, 'Anvil GTEX Test')
def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/')
    query = "select submitter_id, read_drs_id drsid from thousand_genomes.onek_genomes.ssd_drs where population = 'BEB' limit 3"

    # Step 2 - DRS - set up a DRS Client
    # BioDataCatalyst
    drsClient = bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')

    # Step 3 - set up a class that runs samtools for us
    # providing the location where we want the results to go
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

    faspRunner.configure(searchClient, drsClient, mysam)
    faspRunner.runQuery(query, 'One k query using Search')
def main(argv):
    faspRunner = FASPRunner()
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant DRS objects
    discoveryClients = {
        "crdc": BigQuerySearchClient(),
        "anv": Gen3ManifestClient('./fasp/data/gtex/gtex-cram-manifest_wCuries.json')
    }

    # TCGA Query - CRDC
    crdcquery = """
        SELECT 'case_'||associated_entities__case_gdc_id, 'crdc:'||file_id
        FROM `isb-cgc.GDC_metadata.rel24_fileData_active`
        where data_format = 'BAM'
        and project_disease_type = 'Breast Invasive Carcinoma'
        limit 3"""

    # Run both queries and aggregate the results
    results = discoveryClients['anv'].runQuery(3)  # Send the query
    results += discoveryClients['crdc'].runQuery(crdcquery)

    # Step 2 - DRS - set up DRS Clients
    # TODO Use DRSMetaResolver so we don't have to build our own resolver in this code
    drsClients = {
        "crdc": crdcDRSClient('~/.keys/crdc_credentials.json', 'gs'),
        "anv": anvilDRSClient('~/.keys/anvil_credentials.json', settings['GCPProject'], 'gs')
    }

    # Step 3 - set up a class that runs samtools for us
    # providing the location for the results
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    wesClient = GCPLSsamtools(location, settings['GCPOutputBucket'])

    # repeat steps 2 and 3 for each row of the query
    for row in results:
        print("subject={}, drsID={}".format(row[0], row[1]))
        resRow = [row[0], row[1]]

        # Step 2 - Use DRS to get the URL
        # This is a local solution to resolve prefixed DRS ids; DRS MetaResolver would be better
        # get the prefix
        prefix, drsid = row[1].split(":", 1)
        drsClient = drsClients[prefix]
        print('Sending id {} to {}'.format(drsid, drsClient.__class__.__name__))

        url = drsClient.getAccessURL(drsid)
        objInfo = drsClient.getObject(drsid)
        #print(objInfo)
        fileSize = objInfo['size']
        #fileSize = 0

        # Step 3 - Run a pipeline on the file at the drs url
        if url is not None:
            outfile = "{}.txt".format(row[0])
            via = 'sh'
            note = 'GTEx and TCGA'
            time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
            run_id = wesClient.runWorkflow(url, outfile)
            searchClient = discoveryClients[prefix]
            faspRunner.logRun(time, via, note, run_id, outfile, fileSize,
                searchClient, drsClient, wesClient)
            resRow.append('OK')
        else:
            print('could not get DRS url')
            resRow.append('unauthorized')
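# The prefix handling above could be factored into a small helper; a sketch
# under the same assumptions (compact ids like 'crdc:abc-123' and a dict of
# per-prefix clients), pending the DRSMetaResolver mentioned in the TODO.
def resolve_prefixed_drs_id(curie, clients):
    # split 'crdc:abc-123' into ('crdc', 'abc-123') at the first colon
    prefix, drs_id = curie.split(':', 1)
    return clients[prefix], drs_id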
''' Query to illustrate Anne's use case for variants related to a gene
involved in a rare pediatric brain cancer '''

# IMPORTS
import sys

from fasp.runner import FASPRunner

# The implementations we're using
from fasp.loc import crdcDRSClient
from fasp.workflow import GCPLSsamtools
from fasp.search import BigQuerySearchClient

faspRunner = FASPRunner(pauseSecs=0)
settings = faspRunner.settings

searchClient = BigQuerySearchClient()
drsClient = crdcDRSClient('~/.keys/crdc_credentials.json', 'gs')
location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

faspRunner.configure(searchClient, drsClient, mysam)

query = """
    SELECT mut.case_barcode subject, meta.file_gdc_id as drs_id,
        meta.file_gdc_url as tumor_bam_file_path,
        clin.race, clin.age_at_diagnosis, clin.ethnicity
    FROM `isb-cgc.TCGA_hg38_data_v0.Somatic_Mutation` as mut
    join `isb-cgc.TCGA_bioclin_v0.Clinical` as clin on clin.case_barcode = mut.case_barcode
    join `isb-cgc.GDC_metadata.rel24_GDCfileID_to_GCSurl` as meta"""
def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)
    creditor = faspRunner.creditor
    settings = faspRunner.settings

    # set your Seven Bridges CGC project using what you have put in FASP Settings
    sbProject = settings['SevenBridgesProject']
    sbInstance = settings['SevenBridgesInstance']

    # Step 1 - Discovery
    # query for relevant DRS objects
    discoveryClients = {
        "sb": DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/'),
        "bdc": BigQuerySearchClient()
    }

    crdcquery = "SELECT sp.dbGaP_Subject_ID, 'sb:'||sb_drs_id FROM dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi sp join dbgap_demo.scr_gecco_susceptibility.sample_multi sm on sm.dbgap_subject_id = sp.dbgap_subject_id join dbgap_demo.scr_gecco_susceptibility.sb_drs_index di on di.sample_id = sm.sample_id where AGE between 45 and 55 and sex = 'Female' and file_type = 'cram' limit 3"

    bdcquery = """
        SELECT sp.dbGaP_Subject_ID, 'bdc:'||read_drs_id
        FROM `isbcgc-216220.COPDGene.Subject_MULTI` sm
        join `isbcgc-216220.COPDGene.Subject_Phenotypes_HMB` sp on sp.dbgap_subject_id = sm.dbgap_subject_id
        join `isbcgc-216220.COPDGene.COPD_DRS` drs on drs.su_submitter_id = sm.subject_id
        where gender = '2' and Age_Enroll between 45 and 55
        LIMIT 3"""

    results = discoveryClients['sb'].runQuery(crdcquery)  # Send the query
    creditor.creditFromList('dbGapSSD')
    creditor.creditClass(discoveryClients['sb'])
    results += discoveryClients['bdc'].runQuery(bdcquery)
    creditor.creditFromList('BDCData')

    # Step 2 - DRS - set up DRS Clients
    drsClients = {
        "sb": sbcgcDRSClient('~/.keys/sevenbridges_keys.json', 's3'),
        "bdc": bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')
    }
    print('setting credentials ')
    creditor.creditFromList('dbGaPFence')

    # Step 3 - set up a class that runs samtools for us
    # providing the location for the results
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    sam2 = GCPLSsamtools(location, settings['GCPOutputBucket'])
    samClients = {
        "sb": samtoolsSBClient(sbInstance, sbProject),
        "bdc": sam2
    }

    # repeat steps 2 and 3 for each row of the query
    for row in results:
        print("subject={}, drsID={}".format(row[0], row[1]))
        resRow = [row[0], row[1]]

        # Step 2 - Use DRS to get the URL
        # get the prefix
        prefix, drsid = row[1].split(":", 1)
        drsClient = drsClients[prefix]
        searchClient = discoveryClients[prefix]
        creditor.creditClass(drsClient)
        url = drsClient.getAccessURL(drsid)
        print(url)
        #objInfo = drsClient.getObject(drsid)
        #print(objInfo)
        #fileSize = objInfo['size']
        fileSize = 0

        # Step 3 - Run a pipeline on the file at the drs url
        if url is not None:
            outfile = "{}.txt".format(row[0])
            mysam = samClients[prefix]
            creditor.creditClass(mysam)
            via = 'sh'
            note = 'Two dbGaP sources'
            time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
            run_id = mysam.runWorkflow(url, outfile)
            faspRunner.logRun(time, via, note, run_id, outfile, fileSize,
                searchClient, drsClient, mysam)
            resRow.append('OK')
        else:
            print('could not get DRS url')
            resRow.append('unauthorized')
def main(argv):
    faspRunner = FASPRunner(pauseSecs=0)
    creditor = faspRunner.creditor
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = BigQuerySearchClient()

    # TCGA Query - CRDC
    crdcquery = """
        SELECT 'case_'||associated_entities__case_gdc_id, 'crdc:'||file_id
        FROM `isb-cgc.GDC_metadata.rel24_fileData_active`
        where data_format = 'BAM'
        and project_disease_type = 'Breast Invasive Carcinoma'
        limit 3"""

    # COPD query - TOPMed
    bdcquery = """
        SELECT SUBJECT_ID, 'bdc:'||read_drs_id
        FROM `isbcgc-216220.COPDGene.phenotype_drs`
        where Weight_KG between 92.5 and 93.0
        LIMIT 3"""

    results = searchClient.runQuery(crdcquery)  # Send the query
    creditor.creditFromList('ISBGDCData')
    results += searchClient.runQuery(bdcquery)
    creditor.creditFromList('BDCData')

    # Step 2 - DRS - set up DRS Clients
    drsClients = {
        "crdc": crdcDRSClient('~/.keys/crdc_credentials.json', ''),
        "bdc": bdcDRSClient('~/.keys/bdc_credentials.json', '')
    }
    print('setting credentials ')
    creditor.creditFromList('dbGaPFence')

    # Step 3 - set up a class that runs samtools for us
    # providing the location for the results
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

    # repeat steps 2 and 3 for each row of the query
    for row in results:
        print("subject={}, drsID={}".format(row[0], row[1]))

        # Step 2 - Use DRS to get the URL
        # get the prefix
        prefix, drsid = row[1].split(":", 1)
        url = drsClients[prefix].getAccessURL(drsid, 'gs')
        drsClient = drsClients[prefix]
        creditor.creditClass(drsClient)
        objInfo = drsClient.getObject(drsid)
        fileSize = objInfo['size']

        # Step 3 - Run a pipeline on the file at the drs url
        outfile = "{}.txt".format(row[0])
        mysam.runWorkflow(url, outfile)
        creditor.creditClass(mysam)
        via = 'sh'
        pipeline_id = 'paste here'  # placeholder; the real run id is pasted into the log afterwards
        note = 'Two sources'
        time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        faspRunner.logRun(time, via, note, pipeline_id, outfile, fileSize,
            searchClient, drsClient, mysam)

    creditor.creditFromList('FASPScript2_sdrf', closeImage=False)
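# Each script above is written as a main(argv) function; a conventional
# entry point (assumed here, as the original script trailers are not shown)
# would be:
import sys

if __name__ == "__main__":
    main(sys.argv[1:])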