예제 #1
0
def main(argv):
    """Run one GDC BAM query through a FASPRunner wired to Seven Bridges.

    Builds a BigQuery search client, a CRDC DRS client and a Seven Bridges
    samtools client, hands all three to the runner, and lets the runner
    execute the query.

    argv is accepted but not read by this function.
    """
    runner = FASPRunner(pauseSecs=0)

    # Discovery: the client that will execute the SQL below.
    search = BigQuerySearchClient()

    sql = """
     	SELECT 'case_'||associated_entities__case_gdc_id , file_id
		FROM `isb-cgc.GDC_metadata.rel24_fileData_active` 
		where data_format = 'BAM' 
		and project_disease_type = 'Breast Invasive Carcinoma'
		limit 3"""

    # DRS: client built from the local CRDC credentials file.
    drs = crdcDRSClient('~/.keys/crdc_credentials.json', 's3')

    # Compute: samtools client configured from the runner's settings.
    cfg = runner.settings
    sb_project = cfg['SevenBridgesProject']
    sb_instance = cfg['SevenBridgesInstance']
    samtools = samtoolsSBClient(sb_instance, sb_project)

    runner.configure(search, drs, samtools)
    runner.runQuery(sql, 'GDC query SB compute')
예제 #2
0
def main(argv):
    """Submit every query result to two samtools backends and log each run.

    For each row returned by the query, the DRS object is looked up once,
    then an access URL is requested per backend key ('s3', 'gs') and the
    matching samtools client is asked to run a workflow on it. Every
    submission is recorded through the runner's logRun.

    argv is accepted but not read by this function.
    """
    runner = FASPRunner(pauseSecs=0)
    credit_tracker = runner.creditor
    cfg = runner.settings

    # Seven Bridges project/instance come from the FASP settings.
    sb_project = cfg['SevenBridgesProject']
    sb_instance = cfg['SevenBridgesInstance']

    # Step 1 - Discovery: client used to run the query.
    search = BigQuerySearchClient()

    # Step 2 - DRS client.
    drs = crdcDRSClient('~/.keys/crdc_credentials.json', 's3')

    # Step 3 - one samtools runner per backend; the GCP one is built first.
    gcp_location = f"projects/{cfg['GCPProject']}/locations/{cfg['GCPPipelineRegion']}"
    gcp_samtools = GCPLSsamtools(gcp_location, cfg['GCPOutputBucket'])
    sb_samtools = samtoolsSBClient(sb_instance, sb_project)
    backends = {'s3': sb_samtools, 'gs': gcp_samtools}

    sql = """
     	SELECT 'case_'||associated_entities__case_gdc_id , file_id
		FROM `isb-cgc.GDC_metadata.rel24_fileData_active` 
		where data_format = 'BAM' 
		and project_disease_type = 'Breast Invasive Carcinoma'
		limit 3"""
    print(sql)

    rows = search.runQuery(sql)  # send the query
    credit_tracker.creditFromList('ISBGDCData')

    # These log fields are the same for every submission.
    via = 'py'
    note = 'double submit'

    # Steps 2 and 3 are repeated for each row of the result.
    for row in rows:
        subject = row[0]
        drs_id = row[1]
        print(f"subject={subject}, drsID={drs_id}")

        # Step 2 - object metadata (the size is logged with each run).
        obj_info = drs.getObject(drs_id)
        credit_tracker.creditClass(drs)
        size_text = str(obj_info['size'])
        out_name = f"{subject}.txt"

        # Submit the same file once per backend.
        for access_key, samtools in backends.items():
            access_url = drs.getAccessURL(drs_id, access_key)

            # Step 3 - run the pipeline on the file at the DRS URL.
            credit_tracker.creditClass(samtools)
            task_id = samtools.runWorkflow(access_url, out_name)

            stamp = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
            runner.logRun(stamp, via, note, task_id, out_name, size_text,
                          search, drs, samtools)

    credit_tracker.creditFromList('FASPScript8_sdrf', closeImage=False)
예제 #3
0
	def __init__(self, debug=False):
		"""Set up the table of DRS clients keyed by prefix.

		Each entry of self.drsClients is constructed eagerly here.
		self.registeredClients and self.hostNameIndex start empty.

		debug -- stored on the instance as self.debug.
		"""
		# The "insdc" key used to appear twice in this literal. Python keeps
		# only the last value for a repeated key, so the first sdlDRSClient
		# was built and immediately thrown away. One entry, in the original
		# first position, yields the same mapping with a single construction.
		self.drsClients = {
			"insdc": sdlDRSClient('~/.keys/prj_11218_D17199.ngc'),
			"crdc": crdcDRSClient('~/.keys/crdc_credentials.json','s3'),
			"bdc": bdcDRSClient('~/.keys/bdc_credentials.json','gs'),
			"anv": anvilDRSClient('~/.keys/anvil_credentials.json', '', 'gs'),
			"sbcgc": sbcgcDRSClient('~/.keys/sevenbridges_keys.json','s3'),
			"sbcav": cavaticaDRSClient('~/.keys/sevenbridges_keys.json','s3'),
			# NOTE(review): debug is hard-coded to False here rather than
			# following the debug argument - confirm this is intended.
			"srapub": DRSClient('https://locate.ncbi.nlm.nih.gov', debug=False)
		}
		self.registeredClients = []
		self.hostNameIndex = {}
		self.debug = debug
예제 #4
0
	def DRSClientFromRegistryEntry(self, service, prefix):
		"""Return a DRS client for the given prefix.

		Recognised prefixes get a dedicated client class built from a local
		key file; any other prefix is delegated to
		DRSClient.fromRegistryEntry(service).

		service -- registry entry, only used for unrecognised prefixes.
		prefix  -- prefix string selecting the client.
		"""
		if prefix == "crdc":
			return crdcDRSClient('~/.keys/crdc_credentials.json','s3')
		if prefix == "bdc":
			return bdcDRSClient('~/.keys/bdc_credentials.json','gs')
		if prefix == "insdc":
			return sdlDRSClient('~/.keys/prj_11218_D17199.ngc')
		if prefix == "sbcgc":
			return sbcgcDRSClient('~/.keys/sevenbridges_keys.json','s3')
		if prefix == "sbcav":
			return cavaticaDRSClient('~/.keys/sevenbridges_keys.json','s3')
		# No dedicated client: build one from the registry entry itself.
		return DRSClient.fromRegistryEntry(service)
예제 #5
0
    def __init__(self, debug=False, getReg=True):
        """Set up the table of DRS clients keyed by prefix.

        Each entry of self.drsClients is constructed eagerly here.
        self.registeredClients and self.hostNameIndex start empty.

        debug  -- stored on the instance as self.debug.
        getReg -- when true, self.getRegisteredDRSServices() is called once
                  the attributes above have been initialised.
        """
        # The "insdc" key used to appear twice in this literal. Python keeps
        # only the last value for a repeated key, so the first sdlDRSClient
        # was built and immediately thrown away. One entry, in the original
        # first position, yields the same mapping with a single construction.
        self.drsClients = {
            "insdc": sdlDRSClient('~/.keys/prj_11218_D17199.ngc'),
            "crdc": crdcDRSClient('~/.keys/crdc_credentials.json', 's3'),
            "bdc": bdcDRSClient('~/.keys/bdc_credentials.json', 'gs'),
            "anv": anvilDRSClient('~/.keys/anvil_credentials.json', '', 'gs'),
            "sbcgc": sbcgcDRSClient('~/.keys/sevenbridges_keys.json', 's3'),
            "sbcav": cavaticaDRSClient('~/.keys/sevenbridges_keys.json', 'gs'),
            'sbbdc': sbbdcDRSClient('~/.keys/sevenbridges_keys.json', 's3'),
            "sradrs": SRADRSClient('https://locate.be-md.ncbi.nlm.nih.gov')
        }
        self.registeredClients = []
        self.hostNameIndex = {}
        self.debug = debug

        if getReg:
            self.getRegisteredDRSServices()
예제 #6
0
def main(argv):
	"""Run samtools on GCP for files found by two separate queries.

	Two queries are sent through the same search client and their results
	are combined. The second column of each row is a 'prefix:id' string;
	the prefix picks the DRS client and the id is resolved to a 'gs'
	access URL. A workflow is then run on that URL and the run is logged.

	argv is accepted but not read by this function.
	"""
	runner = FASPRunner(pauseSecs=0)
	credit_tracker = runner.creditor
	cfg = runner.settings

	# Step 1 - Discovery: one client serves both queries.
	search = BigQuerySearchClient()

	# TCGA query - CRDC
	crdc_sql = """
     	SELECT 'case_'||associated_entities__case_gdc_id , 'crdc:'||file_id
		FROM `isb-cgc.GDC_metadata.rel24_fileData_active` 
		where data_format = 'BAM' 
		and project_disease_type = 'Breast Invasive Carcinoma'
		limit 3"""

	# COPD query - Topmed
	bdc_sql = """
  		SELECT SUBJECT_ID, 'bdc:'||read_drs_id
  		FROM `isbcgc-216220.COPDGene.phenotype_drs`
      	where Weight_KG between 92.5 and 93.0
      	LIMIT 3"""

	rows = search.runQuery(crdc_sql)  # send the first query
	credit_tracker.creditFromList('ISBGDCData')
	rows += search.runQuery(bdc_sql)  # append the second query's rows
	credit_tracker.creditFromList('BDCData')

	# Step 2 - DRS clients, keyed by the prefix used in the query output.
	clients_by_prefix = {
		"crdc": crdcDRSClient('~/.keys/crdc_credentials.json', ''),
		"bdc": bdcDRSClient('~/.keys/bdc_credentials.json', '')
	}
	print('setting credentials ')
	credit_tracker.creditFromList('dbGaPFence')

	# Step 3 - samtools runner, told where results should go.
	gcp_location = f"projects/{cfg['GCPProject']}/locations/{cfg['GCPPipelineRegion']}"
	samtools = GCPLSsamtools(gcp_location, cfg['GCPOutputBucket'])

	# These log fields are the same for every row.
	via = 'sh'
	pipeline_id = 'paste here'
	note = 'Two sources'

	# Steps 2 and 3 are repeated for each row of the combined result.
	for row in rows:
		print(f"subject={row[0]}, drsID={row[1]}")

		# Step 2 - split off the prefix and resolve the id with its client.
		prefix, drs_id = row[1].split(":", 1)
		client = clients_by_prefix[prefix]
		access_url = client.getAccessURL(drs_id, 'gs')
		credit_tracker.creditClass(client)
		size = client.getObject(drs_id)['size']

		# Step 3 - run the pipeline on the file at the DRS URL.
		out_name = f"{row[0]}.txt"
		samtools.runWorkflow(access_url, out_name)
		credit_tracker.creditClass(samtools)
		stamp = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
		runner.logRun(stamp, via, note, pipeline_id, out_name, size,
			search, client, samtools)

	credit_tracker.creditFromList('FASPScript2_sdrf', closeImage=False)
예제 #7
0
''' Query to illustrate Anne's use case for variants related to a gene involved in a rare pediatric brain cancer'''
#  IMPORTS
import sys

from fasp.runner import FASPRunner

# The implementations we're using
from fasp.loc import crdcDRSClient
from fasp.workflow import GCPLSsamtools
from fasp.search import BigQuerySearchClient

# Runner plus its settings; pipeline configuration is read from the latter.
faspRunner = FASPRunner(pauseSecs=0)
settings = faspRunner.settings

# Search client and CRDC DRS client (the latter built from a local key file).
searchClient = BigQuerySearchClient()
drsClient = crdcDRSClient('~/.keys/crdc_credentials.json', 'gs')

# samtools runner: location string and output bucket both come from settings.
location = f"projects/{settings['GCPProject']}/locations/{settings['GCPPipelineRegion']}"
mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

# Hand all three clients to the runner.
faspRunner.configure(searchClient, drsClient, mysam)

query = """
SELECT mut.case_barcode subject, meta.file_gdc_id as drs_id, 
meta.file_gdc_url as tumor_bam_file_path,
clin.race, clin.age_at_diagnosis, clin.ethnicity
  
FROM `isb-cgc.TCGA_hg38_data_v0.Somatic_Mutation` as mut 
join `isb-cgc.TCGA_bioclin_v0.Clinical` as clin 
on clin.case_barcode = mut.case_barcode 
join `isb-cgc.GDC_metadata.rel24_GDCfileID_to_GCSurl` as meta