def main(argv):
    """Query GDC metadata for BAM files and hand them to a FASPRunner.

    Builds a search client, a CRDC DRS client and a Seven Bridges samtools
    client, registers all three with a FASPRunner, then asks the runner to
    execute the query under the label 'GDC query SB compute'.

    Args:
        argv: command-line arguments; not used by this function.
    """
    runner = FASPRunner(pauseSecs=0)

    # Step 1 - Discovery
    # query for relevant DRS objects
    searcher = BigQuerySearchClient()
    gdcBamQuery = (
        " SELECT 'case_'||associated_entities__case_gdc_id , file_id"
        " FROM `isb-cgc.GDC_metadata.rel24_fileData_active`"
        " where data_format = 'BAM'"
        " and project_disease_type = 'Breast Invasive Carcinoma'"
        " limit 3"
    )

    # Step 2 - DRS client for the file ids returned by the query
    drs = crdcDRSClient('~/.keys/crdc_credentials.json', 's3')

    # Step 3 - set up a class that runs samtools for us
    # The Seven Bridges project and instance come from the runner's settings.
    runnerSettings = runner.settings
    project = runnerSettings['SevenBridgesProject']
    instance = runnerSettings['SevenBridgesInstance']
    samtools = samtoolsSBClient(instance, project)

    runner.configure(searcher, drs, samtools)
    runner.runQuery(gdcBamQuery, 'GDC query SB compute')
def main(argv):
    """Run samtools on each queried GDC BAM file through two workflow clients.

    For every row returned by the query, the DRS object is looked up once
    and then, for each entry in the 's3'/'gs' client table, an access URL is
    requested, a workflow is started and the run is logged via the runner.

    Args:
        argv: command-line arguments; not used by this function.
    """
    runner = FASPRunner(pauseSecs=0)
    creditor = runner.creditor
    settings = runner.settings

    # set your Seven Bridges CGC project using what you have put in FASP Settings
    sbProject = settings['SevenBridgesProject']
    sbInstance = settings['SevenBridgesInstance']

    # Step 1 - Discovery
    # query for relevant DRS objects
    searcher = BigQuerySearchClient()

    # Step 2 - DRS - set up a DRS Client
    drs = crdcDRSClient('~/.keys/crdc_credentials.json', 's3')

    # Step 3 - set up the classes that run samtools for us, keyed by the
    # access type that is later passed to getAccessURL
    gcpProject = settings['GCPProject']
    gcpRegion = settings['GCPPipelineRegion']
    gcpLocation = 'projects/{}/locations/{}'.format(gcpProject, gcpRegion)
    gcpSamtools = GCPLSsamtools(gcpLocation, settings['GCPOutputBucket'])
    sbSamtools = samtoolsSBClient(sbInstance, sbProject)
    workflowsByAccess = {'s3': sbSamtools, 'gs': gcpSamtools}

    gdcBamQuery = (
        " SELECT 'case_'||associated_entities__case_gdc_id , file_id"
        " FROM `isb-cgc.GDC_metadata.rel24_fileData_active`"
        " where data_format = 'BAM'"
        " and project_disease_type = 'Breast Invasive Carcinoma'"
        " limit 3"
    )
    print(gdcBamQuery)
    rows = searcher.runQuery(gdcBamQuery)  # Send the query
    creditor.creditFromList('ISBGDCData')

    # Values logged with every run; they do not vary per row.
    via = 'py'
    note = 'double submit'
    stampFormat = "%m/%d/%Y, %H:%M:%S"

    # repeat steps 2 and 3 for each row of the query
    for row in rows:
        subject = row[0]
        drsId = row[1]
        print("subject={}, drsID={}".format(subject, drsId))

        # Step 2 - Use DRS to get the object details
        objInfo = drs.getObject(drsId)
        creditor.creditClass(drs)
        sizeText = str(objInfo['size'])
        outfile = "{}.txt".format(subject)

        # submit to both aws and gcp
        for accessType, workflow in workflowsByAccess.items():
            url = drs.getAccessURL(drsId, accessType)

            # Step 3 - Run a pipeline on the file at the drs url
            creditor.creditClass(workflow)
            taskId = workflow.runWorkflow(url, outfile)

            startedAt = datetime.datetime.now().strftime(stampFormat)
            runner.logRun(startedAt, via, note, taskId, outfile, sizeText,
                          searcher, drs, workflow)

    creditor.creditFromList('FASPScript8_sdrf', closeImage=False)
def __init__(self, debug=False):
    """Set up the table of known DRS clients and empty registry state.

    Args:
        debug: stored on the instance as ``self.debug``.

    Attributes set:
        drsClients: dict mapping a short service name (e.g. "crdc", "bdc")
            to a ready-made DRS client instance.
        registeredClients: starts as an empty list.
        hostNameIndex: starts as an empty dict.
    """
    # Each key appears exactly once: a repeated key in a dict literal would
    # build a second client and silently discard the first one.
    self.drsClients = {
        "insdc": sdlDRSClient('~/.keys/prj_11218_D17199.ngc'),
        "crdc": crdcDRSClient('~/.keys/crdc_credentials.json', 's3'),
        "bdc": bdcDRSClient('~/.keys/bdc_credentials.json', 'gs'),
        "anv": anvilDRSClient('~/.keys/anvil_credentials.json', '', 'gs'),
        "sbcgc": sbcgcDRSClient('~/.keys/sevenbridges_keys.json', 's3'),
        "sbcav": cavaticaDRSClient('~/.keys/sevenbridges_keys.json', 's3'),
        "srapub": DRSClient('https://locate.ncbi.nlm.nih.gov', debug=False),
    }
    self.registeredClients = []
    self.hostNameIndex = {}
    self.debug = debug
def DRSClientFromRegistryEntry(self, service, prefix):
    """Return a DRS client for the given registry entry.

    A handful of prefixes get a dedicated client class built with a key
    file under ``~/.keys``; any other prefix falls back to
    ``DRSClient.fromRegistryEntry(service)``.

    Args:
        service: registry entry, used only for the generic fallback.
        prefix: short service name that selects the client class.

    Returns:
        The newly constructed client.
    """
    # Dedicated clients: only the one matching the prefix is constructed.
    if prefix == "crdc":
        return crdcDRSClient('~/.keys/crdc_credentials.json', 's3')
    if prefix == "bdc":
        return bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')
    if prefix == "insdc":
        return sdlDRSClient('~/.keys/prj_11218_D17199.ngc')
    if prefix == "sbcgc":
        return sbcgcDRSClient('~/.keys/sevenbridges_keys.json', 's3')
    if prefix == "sbcav":
        return cavaticaDRSClient('~/.keys/sevenbridges_keys.json', 's3')

    # No dedicated client: build a generic one from the registry entry.
    return DRSClient.fromRegistryEntry(service)
def __init__(self, debug=False, getReg=True):
    """Set up the table of known DRS clients and optionally load the registry.

    Args:
        debug: stored on the instance as ``self.debug``.
        getReg: when true, ``self.getRegisteredDRSServices()`` is called
            once the attributes below have been initialised.

    Attributes set:
        drsClients: dict mapping a short service name (e.g. "crdc", "bdc")
            to a ready-made DRS client instance.
        registeredClients: starts as an empty list.
        hostNameIndex: starts as an empty dict.
    """
    # Each key appears exactly once: a repeated key in a dict literal would
    # build a second client and silently discard the first one.
    self.drsClients = {
        "insdc": sdlDRSClient('~/.keys/prj_11218_D17199.ngc'),
        "crdc": crdcDRSClient('~/.keys/crdc_credentials.json', 's3'),
        "bdc": bdcDRSClient('~/.keys/bdc_credentials.json', 'gs'),
        "anv": anvilDRSClient('~/.keys/anvil_credentials.json', '', 'gs'),
        "sbcgc": sbcgcDRSClient('~/.keys/sevenbridges_keys.json', 's3'),
        "sbcav": cavaticaDRSClient('~/.keys/sevenbridges_keys.json', 'gs'),
        'sbbdc': sbbdcDRSClient('~/.keys/sevenbridges_keys.json', 's3'),
        "sradrs": SRADRSClient('https://locate.be-md.ncbi.nlm.nih.gov'),
    }
    self.registeredClients = []
    self.hostNameIndex = {}
    self.debug = debug
    if getReg:
        self.getRegisteredDRSServices()
def main(argv):
    """Run samtools on files found by two queries served by two DRS clients.

    Each query tags its DRS ids with a 'crdc:' or 'bdc:' prefix. For every
    result row the prefix selects the DRS client, a 'gs' access URL is
    requested, a workflow is started on it and the run is logged via the
    runner.

    Args:
        argv: command-line arguments; not used by this function.
    """
    runner = FASPRunner(pauseSecs=0)
    creditor = runner.creditor
    settings = runner.settings

    # Step 1 - Discovery
    # query for relevant DRS objects
    searcher = BigQuerySearchClient()

    # TCGA Query - CRDC
    crdcQuery = (
        " SELECT 'case_'||associated_entities__case_gdc_id , 'crdc:'||file_id"
        " FROM `isb-cgc.GDC_metadata.rel24_fileData_active`"
        " where data_format = 'BAM'"
        " and project_disease_type = 'Breast Invasive Carcinoma'"
        " limit 3"
    )

    # COPD query - Topmed
    bdcQuery = (
        " SELECT SUBJECT_ID, 'bdc:'||read_drs_id"
        " FROM `isbcgc-216220.COPDGene.phenotype_drs`"
        " where Weight_KG between 92.5 and 93.0"
        " LIMIT 3"
    )

    rows = searcher.runQuery(crdcQuery)  # Send the query
    creditor.creditFromList('ISBGDCData')
    rows += searcher.runQuery(bdcQuery)
    creditor.creditFromList('BDCData')

    # Step 2 - DRS - set up DRS Clients, keyed by the prefix used in the queries
    crdcClient = crdcDRSClient('~/.keys/crdc_credentials.json', '')
    bdcClient = bdcDRSClient('~/.keys/bdc_credentials.json', '')
    clientsByPrefix = {"crdc": crdcClient, "bdc": bdcClient}
    print('setting credentials ')
    creditor.creditFromList('dbGaPFence')

    # Step 3 - set up a class that runs samtools for us
    # providing the location for the results
    gcpProject = settings['GCPProject']
    gcpRegion = settings['GCPPipelineRegion']
    gcpLocation = 'projects/{}/locations/{}'.format(gcpProject, gcpRegion)
    samtools = GCPLSsamtools(gcpLocation, settings['GCPOutputBucket'])

    # Values logged with every run; they do not vary per row.
    via = 'sh'
    pipelineId = 'paste here'
    note = 'Two sources'
    stampFormat = "%m/%d/%Y, %H:%M:%S"

    # repeat steps 2 and 3 for each row of the query
    for row in rows:
        subject = row[0]
        taggedId = row[1]
        print("subject={}, drsID={}".format(subject, taggedId))

        # Step 2 - Use DRS to get the URL
        # get the prefix
        prefix, drsId = taggedId.split(":", 1)
        drs = clientsByPrefix[prefix]
        url = drs.getAccessURL(drsId, 'gs')
        creditor.creditClass(drs)
        fileSize = drs.getObject(drsId)['size']

        # Step 3 - Run a pipeline on the file at the drs url
        outfile = "{}.txt".format(subject)
        samtools.runWorkflow(url, outfile)
        creditor.creditClass(samtools)

        startedAt = datetime.datetime.now().strftime(stampFormat)
        runner.logRun(startedAt, via, note, pipelineId, outfile, fileSize,
                      searcher, drs, samtools)

    creditor.creditFromList('FASPScript2_sdrf', closeImage=False)
''' Query to illustrate Anne's use case for variants related to a gene involved in a rare pediatric brain cancer''' # IMPORTS import sys from fasp.runner import FASPRunner # The implementations we're using from fasp.loc import crdcDRSClient from fasp.workflow import GCPLSsamtools from fasp.search import BigQuerySearchClient faspRunner = FASPRunner(pauseSecs=0) settings = faspRunner.settings searchClient = BigQuerySearchClient() drsClient = crdcDRSClient('~/.keys/crdc_credentials.json', 'gs') location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion']) mysam = GCPLSsamtools(location, settings['GCPOutputBucket']) faspRunner.configure(searchClient, drsClient, mysam) query = """ SELECT mut.case_barcode subject, meta.file_gdc_id as drs_id, meta.file_gdc_url as tumor_bam_file_path, clin.race, clin.age_at_diagnosis, clin.ethnicity FROM `isb-cgc.TCGA_hg38_data_v0.Somatic_Mutation` as mut join `isb-cgc.TCGA_bioclin_v0.Clinical` as clin on clin.case_barcode = mut.case_barcode join `isb-cgc.GDC_metadata.rel24_GDCfileID_to_GCSurl` as meta