Example #1
def main(argv):

    faspRunner = FASPRunner(pauseSecs=0)
    creditor = faspRunner.creditor
    settings = faspRunner.settings

    # set your Seven Bridges CGC project using what you have put in FASP Settings
    sbProject = settings['SevenBridgesProject']
    sbInstance = settings['SevenBridgesInstance']

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = BigQuerySearchClient()

    # Step 2 - DRS - set up a DRS Client
    drsClient = crdcDRSClient('~/.keys/crdc_credentials.json', 's3')

    # Step 3 - set up a class that runs samtools for us
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'],
                                                 settings['GCPPipelineRegion'])
    sam2 = GCPLSsamtools(location, settings['GCPOutputBucket'])
    mysams = {'s3': samtoolsSBClient(sbInstance, sbProject), 'gs': sam2}

    query = """
     	SELECT 'case_'||associated_entities__case_gdc_id , file_id
		FROM `isb-cgc.GDC_metadata.rel24_fileData_active` 
		where data_format = 'BAM' 
		and project_disease_type = 'Breast Invasive Carcinoma'
		limit 3"""
    print(query)

    query_job = searchClient.runQuery(query)  # Send the query
    creditor.creditFromList('ISBGDCData')

    # repeat steps 2 and 3 for each row of the query

    for row in query_job:

        print("subject={}, drsID={}".format(row[0], row[1]))

        # Step 2 - Use DRS to get the URL
        objInfo = drsClient.getObject(row[1])
        creditor.creditClass(drsClient)
        fileSize = objInfo['size']
        outfile = "{}.txt".format(row[0])
        # submit to both AWS and GCP
        for cl, mysam in mysams.items():
            url = drsClient.getAccessURL(row[1], cl)
            # Step 3 - Run a pipeline on the file at the drs url

            creditor.creditClass(mysam)
            task_id = mysam.runWorkflow(url, outfile)
            via = 'py'
            note = 'double submit'

            time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
            faspRunner.logRun(time, via, note, task_id, outfile, str(fileSize),
                              searchClient, drsClient, mysam)

    creditor.creditFromList('FASPScript8_sdrf', closeImage=False)
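
These snippets omit their imports and entry point. A minimal sketch of the wrapper each one assumes, following the package layout visible in Example #18 (fasp.runner, fasp.search, fasp.loc, fasp.workflow); the module path for samtoolsSBClient is an assumption:

# Sketch only: import paths follow Example #18; samtoolsSBClient's module is assumed.
import sys
import datetime

from fasp.runner import FASPRunner
from fasp.search import BigQuerySearchClient
from fasp.loc import crdcDRSClient
from fasp.workflow import GCPLSsamtools, samtoolsSBClient  # assumed location

if __name__ == "__main__":
    main(sys.argv[1:])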
Example #2
def main(argv):

    faspRunner = FASPRunner(pauseSecs=0)
    creditor = faspRunner.creditor

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = DiscoverySearchClient(
        'https://ga4gh-search-adapter-presto-public.prod.dnastack.com/',
        debug=False)

    query = "SELECT file_name, compact_drs_id, hostbased_drs_id, drs_id from thousand_genomes.onek_genomes.onek_recal_variants_drs where chromosome = 'chr21' and annotated = false"
    print(query)

    query_job = searchClient.runQuery(query)  # Send the query
    creditor.creditClass(searchClient)

    # Step 2 - DRS - use the MetaResolver to send DRS ids to the right service
    drsResolver = DRSMetaResolver(getReg=False)

    # Step 3 - set up a class that runs a compute for us
    wesClient = DNAStackWESClient('~/.keys/dnastack_wes_credentials.json')

    # repeat steps 2 and 3 for each row of the query
    # this example should find IDs for the same file in both BioDataCatalyst and AnVIL
    for row in query_job:
        drs_id = row[1]
        print("vcffile={}, compact drsID={}".format(row[0], drs_id))

        # Step 2 - Use DRS to get the URL
        objInfo = drsResolver.getObject(drs_id)
        drsClient, localid = drsResolver.getClient(drs_id)
        print(drsClient)
        creditor.creditClass(drsClient)
        fileSize = objInfo['size']

        vcfurl = drsResolver.getAccessURL(drs_id, 'gs')
        # Step 3 - Run a pipeline on the file at the drs url
        pipeline_id = wesClient.runGWASWorkflow(
            vcfurl, 'gs://dnastack-public-bucket/thousand_genomes_meta.csv')
        creditor.creditClass(wesClient)
        print('submitted:{}'.format(pipeline_id))

        outfile = ''
        via = 'WES'
        note = 'GWAS'

        time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        faspRunner.logRun(time, via, note, pipeline_id, outfile, str(fileSize),
                          searchClient, drsClient, wesClient)
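
DRSMetaResolver hides the prefix dispatch that Examples #17 and #19 perform by hand. A self-contained sketch of that dispatch; the compact id and the client placeholder are illustrative values, not real identifiers:

def resolve(compact_id, clients):
    # Split a prefixed (compact) DRS id and pick the matching client,
    # as Examples #17 and #19 do inline with row[1].split(":", 1).
    prefix, local_id = compact_id.split(":", 1)
    return clients[prefix], local_id

client, local_id = resolve("bdc:some-local-id", {"bdc": "a bdcDRSClient instance"})
print(client, local_id)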
Example #3
def main(argv):

    faspRunner = FASPRunner()
    settings = faspRunner.settings
    logTable = pd.read_table(faspRunner.pipelineLogFile, dtype={'status': str})
    sbSystem = settings['SevenBridgesInstance']
    sbProject = settings['SevenBridgesProject']

    location = 'projects/{}/locations/{}'.format(settings['GCPProject'],
                                                 settings['GCPPipelineRegion'])
    gcsam = GCPLSsamtools(location, settings['GCPOutputBucket'])
    wesClients = {
        'samtoolsSBClient': samtoolsSBClient(sbSystem, sbProject),
        'DNAStackWESClient': DNAStackWESClient('~/.keys/DNAStackWESkey.json'),
        'GCPLSsamtools': gcsam
    }

    for i, row in logTable.iterrows():
        wesClientClassName = row["wesClient"]
        run_id = row["pipeline_id"]
        if run_id == 'paste here':
            logTable.at[i, 'status'] = '0'
        else:
            if pd.isna(row["status"]) or row["status"].lower() == 'running':
                wc = wesClients[wesClientClassName]
                status = wc.getTaskStatus(row["pipeline_id"])
                print('Updated run:{} status:{}'.format(run_id, status))
                logTable.at[i, 'status'] = status

    #logTable.to_csv('pipeline_w_status.txt', sep='\t', index=False)
    logTable.to_csv(faspRunner.pipelineLogFile, sep='\t', index=False)
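
The same getTaskStatus call can poll a single run to completion rather than sweeping the whole log. A hedged sketch, assuming only the getTaskStatus signature used above; terminal status names vary by WES client:

import time

def wait_for_run(wesClient, run_id, pauseSecs=30):
    # Poll until the run leaves the 'running' state; mirrors the
    # status check in the loop above.
    status = wesClient.getTaskStatus(run_id)
    while status.lower() == 'running':
        time.sleep(pauseSecs)
        status = wesClient.getTaskStatus(run_id)
    return status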
Example #4
def main(argv):

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = DiscoverySearchClient(
        'https://ga4gh-search-adapter-presto-public.prod.dnastack.com/')
    query = "select submitter_id, read_drs_id drsid from thousand_genomes.onek_genomes.ssd_drs where population = 'ACB' limit 1"
    query_job = searchClient.runQuery(query)

    # Step 2 - DRS - set up a DRS Client
    # NCBI SDL
    drsClient = sdlDRSClient('~/.keys/prj_11218_D17199.ngc')

    # Step 3 - set up a class that runs a compute for us
    wesClient = DNAStackWESClient('~/.keys/DNAStackWESkey.json')

    # A log is helpful to keep track of the computes we've submitted
    faspRunner = FASPRunner()

    # repeat steps 2 and 3 for each row of the query
    for row in query_job:

        print("subject={}, drsID={}".format(row[0], row[1]))

        # Step 2 - Use DRS to get the URL
        #objInfo = drsClient.getObject(row[1])
        # for testing, use a fixed accession instead of row[1]
        acc = 'SRR5368359.sra'
        objInfo = drsClient.getObject(acc)
        fileSize = objInfo['size']
        print(fileSize)
        # we've predetermined we want to use the gs copy in this case
        #url = drsClient.getAccessURL(row[1], 'gs')
        res = drsClient.getAccessURL(acc, 'gs.us')
        url = res['url']
        print(url)
        # Step 3 - Run a pipeline on the file at the drs url
        outfile = "{}.txt".format(row[0])
        pipeline_id = wesClient.runWorkflow(url, outfile)
        print('submitted:{}'.format(pipeline_id))

        via = 'WES'
        note = 'WES MD5 on NCBI SDL'

        time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        faspRunner.logRun(time, via, note, pipeline_id, outfile, str(fileSize),
                          searchClient, drsClient, wesClient)
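
Note that getAccessURL returns a dict with a 'url' key here, whereas the DRS clients in Examples #17 and #19 return the signed URL directly. A tiny helper to normalize both shapes when mixing clients:

def as_url(res):
    # sdlDRSClient responses carry the URL under 'url'; other clients in
    # these examples return it as a plain string.
    return res['url'] if isinstance(res, dict) else res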
Example #5
def main(argv):

	faspRunner = FASPRunner(pauseSecs=0)
	
	# Step 1 - Discovery
	# query for relevant DRS objects
	searchClient = BigQuerySearchClient()
	query = """SELECT sra.biosample, sra.acc||'.cram'
		FROM `isbcgc-216220.GECCO_CRC_Susceptibility.Subject_Phenotypes` sp
		join `isbcgc-216220.GECCO_CRC_Susceptibility.Sample_MULTI` sm on
		sm.dbgap_subject_id = sp.dbgap_subject_id
		join `nih-sra-datastore.sra.metadata` sra on sm.BioSample_Accession = sra.biosample
		where AGE between 45 and 55 and sex = 'Female' limit 3"""
	query_job = searchClient.runQuery(query)
	
	# Step 2 - DRS - set up a DRS Client
	# NCBI SDL
	drsClient = sdlDRSClient('~/.keys/prj_14565.ngc', True)
	
	# Step 3 - set up a class that runs a compute for us
	wesClient = DNAStackWESClient('~/.keys/dnastack_wes_credentials.json')
	
	# repeat steps 2 and 3 for each row of the query
	for row in query_job:

		print("sample={}, drsID={}".format(row[0], row[1]))
		
		# Step 2 - Use DRS to get the URL
		objInfo = drsClient.getObject(row[1])
		fileSize = objInfo['size']
		print(fileSize)
		# we've predetermined we want to use the gs copy in this case
		#url = drsClient.getAccessURL(row[1], 'gs')
		res = drsClient.getAccessURL(row[1],'gs.us')
		url = res['url']
		print(url)
		# Step 3 - Run a pipeline on the file at the drs url
		outfile = "{}.txt".format(row[0])
		pipeline_id = wesClient.runWorkflow(url, outfile)
		print('submitted:{}'.format(pipeline_id))
		
		via = 'WES'
		note = 'WES MD5 on NCBI SDL'

		time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
		faspRunner.logRun(time, via, note,  pipeline_id, outfile, str(fileSize),
			searchClient, drsClient, wesClient)
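
Every example stamps its log rows with the same strftime pattern; factoring it out keeps the log format in one place:

import datetime

def timestamp():
    # The timestamp format used by logRun throughout these examples.
    return datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")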
Example #6
def main(argv):

    faspRunner = FASPRunner(pauseSecs=0)
    creditor = faspRunner.creditor
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant files
    searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/')
    query = "SELECT sample_submitter_id, fileid, filename FROM dbgap_demo.scr_ega.scr_egapancreatic_sample_multi p join dbgap_demo.scr_ega.scr_egapancreatic_files f on f.sample_primary_id = p.sample_primary_id where phenotype = 'pancreatic adenocarcinoma' limit 3"
    query_job = searchClient.runQuery(query)
    
    # Step 2 - Use htsget at EGA
    htsgetClient = EGAhtsget('~/.keys/ega.credentials')
    
    # Step 3 - set up a class that runs a compute for us
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    wesClient = GCPLSsamtools(location, settings['GCPOutputBucket'])

    # repeat steps 2 and 3 for each row of the query
    for row in query_job:

        print("sample={}, EGAFileID={}".format(row[0], row[1]))
        
        # Step 2 - Use htsget to get the file size
        fileSize = htsgetClient.getSize(row[1])
        print(fileSize)
        # the htsget retrieval is stubbed out here; a previously downloaded slice is used
        #htsgetClient.htsget(row[1], 'chr1', 100000, 102000, 'BAM', row[2])
        localfile = 'NA19377.unmapped.ILLUMINA.bwa.LWK.low_coverage.20120522.bam'  # would be row[2]
        # Step 3 - Run a pipeline on the file at the drs url
        outfile = "{}.txt".format(row[0])
        pipeline_id = wesClient.runWorkflow(localfile, outfile)
        #print('submitted:{}'.format(pipeline_id))
        
        via = 'local'
        note = 'samtools on htsget BAM'

        time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        faspRunner.logRun(time, via, note,  pipeline_id, outfile, str(fileSize),
            searchClient, htsgetClient, wesClient)
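
The htsget retrieval above is stubbed out in favor of a pre-downloaded BAM. A sketch of re-enabling it, reusing the exact call signature from the commented-out line; the chr1 region is illustrative:

def fetch_slice(htsgetClient, file_id, out_name):
    # Writes a BAM slice of chr1:100000-102000 (illustrative region) to
    # out_name, using the signature from the commented-out call above.
    htsgetClient.htsget(file_id, 'chr1', 100000, 102000, 'BAM', out_name)
    return out_name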
Example #7
def main(argv):

    faspRunner = FASPRunner(pauseSecs=0)
    settings = faspRunner.settings
    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = BigQuerySearchClient()
    # Alternative COPDGene query:
    # query = """
    #     SELECT subject_id, read_drs_id
    #     FROM `isbcgc-216220.COPDGene.phenotype_drs`
    #     where weight_kg between 91.8 and 93.0
    #     LIMIT 1"""
    query = """
        SELECT submitter_id, read_drs_id
        FROM `isbcgc-216220.onek_genomes.ssd_drs`
        where population = 'BEB'
        LIMIT 1"""

    # BioDataCatalyst
    drsClient = bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'],
                                                 settings['GCPPipelineRegion'])
    mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

    faspRunner.configure(searchClient, drsClient, mysam)

    faspRunner.runQuery(query, 'One k query')
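
configure plus runQuery replaces the explicit per-row loop of Examples #1-#6. A hedged sketch of the equivalent explicit loop, reconstructed from Examples #1 and #2; it is an assumption about what runQuery does, not FASPRunner's actual implementation:

import datetime

def run_query_explicit(faspRunner, searchClient, drsClient, wesClient, query, note):
    # Assumed equivalent of faspRunner.runQuery(query, note).
    for row in searchClient.runQuery(query):
        objInfo = drsClient.getObject(row[1])
        url = drsClient.getAccessURL(row[1], 'gs')
        outfile = "{}.txt".format(row[0])
        run_id = wesClient.runWorkflow(url, outfile)
        time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        faspRunner.logRun(time, 'py', note, run_id, outfile,
                          str(objInfo['size']), searchClient, drsClient, wesClient)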
Example #8
def main(argv):


	faspRunner = FASPRunner(pauseSecs=0)
	settings = faspRunner.settings
	# Step 1 - Discovery
	# query for relevant DRS objects
	searchClient = BigQuerySearchClient()

	query = """
		SELECT s.sample_name, drs_id, s.acc, assay_type, filename, 
		FROM `nih-sra-datastore.sra.metadata` s, unnest(attributes) att
		join `isbcgc-216220.onek_genomes.sra_drs_files` d on d.acc = s.acc
		where filetype = 'bam' and mapped = 'mapped' and sequencing_type ='exome'
		and att.k = 'population_sam' and att.v = 'JPT' 
		LIMIT 3"""

	#drsClient = DRSMetaResolver()
	drsClient = DRSClient('https://locate.ncbi.nlm.nih.gov', access_id='2', public=True)
	location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
	mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

	faspRunner.configure(searchClient, drsClient, mysam)
		
	faspRunner.runQuery(query, 'One k query SRA DRS')
Example #9
def main(argv):

    faspRunner = FASPRunner(pauseSecs=0)

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = BigQuerySearchClient()

    query = """
     	SELECT 'case_'||associated_entities__case_gdc_id , file_id
		FROM `isb-cgc.GDC_metadata.rel24_fileData_active` 
		where data_format = 'BAM' 
		and project_disease_type = 'Breast Invasive Carcinoma'
		limit 3"""

    drsClient = crdcDRSClient('~/.keys/crdc_credentials.json', 's3')

    # Step 3 - set up a class that runs samtools for us
    sbProject = faspRunner.settings['SevenBridgesProject']
    sbInst = faspRunner.settings['SevenBridgesInstance']
    mysam = samtoolsSBClient(sbInst, sbProject)

    faspRunner.configure(searchClient, drsClient, mysam)

    faspRunner.runQuery(query, 'GDC query SB compute')
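
The second argument to crdcDRSClient picks which cloud copy DRS signs a URL for: 's3' here so the Seven Bridges (AWS) compute can read the file, 'gs' in Examples #7 and #18 for GCP compute. Retargeting is a one-line change:

# 'gs' requests the Google Cloud Storage copy instead of the S3 copy.
drsClient = crdcDRSClient('~/.keys/crdc_credentials.json', 'gs')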
Example #10
def main(argv):

    # edit the following line for where you put your ngc credentials file from dbGaP
    credentials_file = '~/.keys/prj_14565.ngc'

    faspRunner = FASPRunner(pauseSecs=0)
    settings = faspRunner.settings
    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = localSearchClient()
    query_job = searchClient.runQuery('')

    drsClient = sdlDRSClient(credentials_file, debug=True)

    location = 'projects/{}/locations/{}'.format(settings['GCPProject'],
                                                 settings['GCPPipelineRegion'])
    mysam = GCPLSsamtools(location, settings['GCPOutputBucket'], debug=True)

    # repeat steps 2 and 3 for each row of the query
    for row in query_job:

        print("subject={}, drsID={}".format(row[0], row[1]))

        # Step 2 - Use DRS to get the URL
        objInfo = drsClient.getObject(row[1])
        fileSize = objInfo['size']
        print(fileSize)
        # we've predetermined we want to use the gs copy in this case
        #url = drsClient.getAccessURL(row[1], 'gs')
        res = drsClient.getAccessURL(row[1], 'gs.us')
        url = res['url']
        print(url)
        # Step 3 - Run a pipeline on the file at the drs url
        outfile = "{}.txt".format(row[0])
        pipeline_id = mysam.runWorkflow(url, outfile)
        print('submitted:{}'.format(pipeline_id))

        via = ''
        note = 'Anvil GTEX Test via SDL'

        time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        faspRunner.logRun(time, via, note, pipeline_id, outfile, str(fileSize),
                          searchClient, drsClient, mysam)
Example #11
def main(argv):


	faspRunner = FASPRunner(pauseSecs=0)
	settings = faspRunner.settings
	# Step 1 - Discovery
	# query for relevant DRS objects
	searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/', debug=True)

	query = "SELECT s.su_submitter_id, drs_id FROM thousand_genomes.onek_genomes.ssd_drs s join thousand_genomes.onek_genomes.sra_drs_files f on f.sample_name = s.su_submitter_id where filetype = 'bam' and mapped = 'mapped' and sequencing_type ='exome' and  population = 'JPT' LIMIT 3"

	drsClient = DRSClient('https://locate.ncbi.nlm.nih.gov', access_id='2', debug=True, public=True)
	location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
	mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

	faspRunner.configure(searchClient, drsClient, mysam)
		
	faspRunner.runQuery(query, 'One k query SRA DRS')
Example #12
def main(argv):

    faspRunner = FASPRunner()
    settings = faspRunner.settings

    searchClient = Gen3ManifestClient(
        './fasp/data/gtex/gtex-cram-manifest.json')

    drsClient = anvilDRSClient('~/.keys/anvil_credentials.json',
                               access_id='s3')

    wesClient = sbWESClient(settings['SevenBridgesInstance'],
                            settings['SevenBridgesProject'],
                            '~/.keys/sbcgc_key.json')

    faspRunner.configure(searchClient, drsClient, wesClient)

    faspRunner.runQuery(3, 'Anvil GTEX Test')
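
Here runQuery takes a row limit rather than SQL because the search client reads a local manifest. A self-contained sketch of that pattern; the class below is an illustrative stand-in, not Gen3ManifestClient's actual code:

import json

class ManifestSearch:
    # Illustrative manifest-backed search client: runQuery returns the
    # first N entries of a local JSON manifest.
    def __init__(self, path):
        with open(path) as f:
            self.rows = json.load(f)

    def runQuery(self, limit):
        return self.rows[:limit]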
Example #13
def main(argv):

    faspRunner = FASPRunner(pauseSecs=0)

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = DiscoverySearchClient(
        'https://ga4gh-search-adapter-presto-public.prod.dnastack.com/')
    query = "select submitter_id, read_drs_id drsid from thousand_genomes.onek_genomes.ssd_drs where population = 'ACB' limit 3"

    # Step 2 - DRS - set up a DRS Client
    drsClient = bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')

    # Step 3 - set up a class that runs a compute for us
    wesClient = DNAStackWESClient('~/.keys/dnastack_wes_credentials.json')

    faspRunner.configure(searchClient, drsClient, wesClient)

    faspRunner.runQuery(query, 'One k query using Search and WES')
Example #14
def main(argv):

    faspRunner = FASPRunner(pauseSecs=0)

    pp_dbgap_join = "SELECT sp.dbGaP_Subject_ID,  'sbcgc:'||sb_drs_id FROM dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi sp join dbgap_demo.scr_gecco_susceptibility.sample_multi sm on sm.dbgap_subject_id = sp.dbgap_subject_id join dbgap_demo.scr_gecco_susceptibility.sb_drs_index di on di.sample_id = sm.sample_id join sample_phenopackets.ga4gh_tables.gecco_phenopackets pp on pp.id = sm.biosample_accession where  json_extract_scalar(pp.phenopacket, '$.subject.sex') = 'MALE' and file_type = 'cram' limit 3"

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = DiscoverySearchClient(
        'https://ga4gh-search-adapter-presto-public.prod.dnastack.com/',
        debug=True)

    # Step 2 - DRS - a metaresolver works out which DRS server is required
    drsClient = DRSMetaResolver()

    # Step 3 - set up a class that runs a compute for us
    wesClient = DNAStackWESClient('~/.keys/dnastack_wes_credentials.json')

    faspRunner.configure(searchClient, drsClient, wesClient)

    faspRunner.runQuery(pp_dbgap_join, 'Phenopacket Gecco')
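
The query filters server-side with json_extract_scalar on the stored phenopacket. For comparison, the same extraction client-side; the input document is an illustrative fragment:

import json

def subject_sex(phenopacket_json):
    # Python equivalent of json_extract_scalar(pp.phenopacket, '$.subject.sex').
    return json.loads(phenopacket_json).get('subject', {}).get('sex')

print(subject_sex('{"subject": {"sex": "MALE"}}'))  # illustrative phenopacket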
Example #15
def main(argv):

    # edit the following line for where you put your credentials file from anvil
    credentials_file = '~/.keys/anvil_credentials.json'

    faspRunner = FASPRunner(pauseSecs=0)
    settings = faspRunner.settings
    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = localSearchClient()

    #drsClient = DRSMetaResolver()

    drsClient = anvilDRSClient(credentials_file, settings['GCPProject'], 'gs')
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'],
                                                 settings['GCPPipelineRegion'])
    workflowClient = GCPLSsamtools(location, settings['GCPOutputBucket'])

    faspRunner.configure(searchClient, drsClient, workflowClient)

    faspRunner.runQuery(12, 'Anvil GTEX Test')
Example #16
def main(argv):


	faspRunner = FASPRunner(pauseSecs=0)
	settings = faspRunner.settings
	# Step 1 - Discovery
	# query for relevant DRS objects
	searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/')
	query = "select submitter_id, read_drs_id drsid from thousand_genomes.onek_genomes.ssd_drs where population = 'BEB' limit 3"

	# Step 2 - DRS - set up a DRS Client
	# CRDC
	drsClient = bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')

	
	# Step 3 - set up a class that runs samtools for us
	# providing the location where we want the results to go
	location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
	mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])	

	faspRunner.configure(searchClient, drsClient, mysam)
		
	faspRunner.runQuery(query, 'One k query using Search')
Example #17
def main(argv):

    faspRunner = FASPRunner()
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant DRS objects
    discoveryClients = {
        "crdc": BigQuerySearchClient(),
        "anv": Gen3ManifestClient('./fasp/data/gtex/gtex-cram-manifest_wCuries.json')
    }

    # TCGA Query - CRDC
    crdcquery = """
     	SELECT 'case_'||associated_entities__case_gdc_id , 'crdc:'||file_id
		FROM `isb-cgc.GDC_metadata.rel24_fileData_active` 
		where data_format = 'BAM' 
		and project_disease_type = 'Breast Invasive Carcinoma'
		limit 3"""

    # Run both queries and aggregate the results
    results = discoveryClients['anv'].runQuery(3)  # Send the query
    results += discoveryClients['crdc'].runQuery(crdcquery)

    # Step 2 - DRS - set up DRS Clients
    # TODO use DRSMetaResolver so we don't have to build our own resolver in this code
    drsClients = {
        "crdc": crdcDRSClient('~/.keys/crdc_credentials.json', 'gs'),
        "anv": anvilDRSClient('~/.keys/anvil_credentials.json',
                              settings['GCPProject'], 'gs')
    }

    # Step 3 - set up a class that runs samtools for us
    # providing the location for the results
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'],
                                                 settings['GCPPipelineRegion'])
    wesClient = GCPLSsamtools(location, settings['GCPOutputBucket'])

    # repeat steps 2 and 3 for each row of the query
    for row in results:

        print("subject={}, drsID={}".format(row[0], row[1]))
        resRow = [row[0], row[1]]
        # Step 2 - Use DRS to get the URL
        # This is a local solution to resolve prefixed DRS ids; DRSMetaResolver would be better
        # get the prefix

        prefix, drsid = row[1].split(":", 1)
        drsClient = drsClients[prefix]
        print('Sending id {} to {}'.format(drsid,
                                           drsClient.__class__.__name__))

        url = drsClient.getAccessURL(drsid)
        objInfo = drsClient.getObject(drsid)
        #print (objInfo)
        fileSize = objInfo['size']
        #fileSize = 0

        # Step 3 - Run a pipeline on the file at the drs url
        if url is not None:
            outfile = "{}.txt".format(row[0])
            via = 'sh'
            note = 'GTEx and TCGA'
            time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
            run_id = wesClient.runWorkflow(url, outfile)
            searchClient = discoveryClients[prefix]
            faspRunner.logRun(time, via, note, run_id, outfile, fileSize,
                              searchClient, drsClient, wesClient)
            resRow.append('OK')
        else:
            print('could not get DRS url')
            resRow.append('unauthorized')
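
Per the TODO above, DRSMetaResolver would replace the hand-rolled clients dict. A hedged sketch of that substitution using the getClient and getAccessURL calls shown in Example #2; the import path is an assumption:

from fasp.loc import DRSMetaResolver  # assumed module path

def resolve_and_sign(curie):
    # curie is a prefixed id from the queries above, e.g. 'crdc:<file_id>'.
    drsResolver = DRSMetaResolver()
    drsClient, local_id = drsResolver.getClient(curie)
    return drsClient, drsResolver.getAccessURL(curie, 'gs')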
Example #18
''' Query to illustrate Anne's use case for variants related to a gene involved in a rare pediatric brain cancer'''
#  IMPORTS
import sys

from fasp.runner import FASPRunner

# The implementations we're using
from fasp.loc import crdcDRSClient
from fasp.workflow import GCPLSsamtools
from fasp.search import BigQuerySearchClient

faspRunner = FASPRunner(pauseSecs=0)
settings = faspRunner.settings

searchClient = BigQuerySearchClient()
drsClient = crdcDRSClient('~/.keys/crdc_credentials.json', 'gs')
location = 'projects/{}/locations/{}'.format(settings['GCPProject'],
                                             settings['GCPPipelineRegion'])
mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

faspRunner.configure(searchClient, drsClient, mysam)

query = """
SELECT mut.case_barcode subject, meta.file_gdc_id as drs_id, 
meta.file_gdc_url as tumor_bam_file_path,
clin.race, clin.age_at_diagnosis, clin.ethnicity
  
FROM `isb-cgc.TCGA_hg38_data_v0.Somatic_Mutation` as mut 
join `isb-cgc.TCGA_bioclin_v0.Clinical` as clin 
on clin.case_barcode = mut.case_barcode 
join `isb-cgc.GDC_metadata.rel24_GDCfileID_to_GCSurl` as meta 
Example #19
def main(argv):

	
	faspRunner = FASPRunner(pauseSecs=0)
	creditor = faspRunner.creditor
	settings = faspRunner.settings
	
	# set your Seven Bridges CGC project using what you have put in FASP Settings
	sbProject = settings['SevenBridgesProject']
	sbInstance = settings['SevenBridgesInstance']

	# Step 1 - Discovery
	# query for relevant DRS objects
	discoveryClients = {
		"sb": DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/'),
		"bdc": BigQuerySearchClient()
	}

	crdcquery = "SELECT sp.dbGaP_Subject_ID,  'sb:'||sb_drs_id FROM dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi sp join dbgap_demo.scr_gecco_susceptibility.sample_multi sm on sm.dbgap_subject_id = sp.dbgap_subject_id join dbgap_demo.scr_gecco_susceptibility.sb_drs_index di on di.sample_id = sm.sample_id where AGE between 45 and 55 and sex = 'Female' and file_type = 'cram' limit 3"
		


	bdcquery = """
		SELECT sp.dbGaP_Subject_ID,  'bdc:'||read_drs_id
		FROM `isbcgc-216220.COPDGene.Subject_MULTI` sm
		join `isbcgc-216220.COPDGene.Subject_Phenotypes_HMB` sp on sp.dbgap_subject_id = sm.dbgap_subject_id
		join `isbcgc-216220.COPDGene.COPD_DRS` drs on drs.su_submitter_id = sm.subject_id
 		where gender = '2'
 		and Age_Enroll between 45 and 55
 		LIMIT 3"""
		

	results = discoveryClients['sb'].runQuery(crdcquery)  # Send the query
	creditor.creditFromList('dbGapSSD')
	creditor.creditClass(discoveryClients['sb'])
	results += discoveryClients['bdc'].runQuery(bdcquery) 
	creditor.creditFromList('BDCData')

	# Step 2 - DRS - set up DRS Clients	
	drsClients = {
		"sb": sbcgcDRSClient('~/.keys/sevenbridges_keys.json', 's3'),
		"bdc": bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')
	}
	print('setting credentials ')
	creditor.creditFromList('dbGaPFence')
		
	# Step 3 - set up a class that runs samtools for us
	# providing the location for the results
	location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
	sam2 = GCPLSsamtools(location, settings['GCPOutputBucket'])
	samClients = {
		"sb": samtoolsSBClient(sbInstance, sbProject),
		"bdc": sam2
	}

	
	# repeat steps 2 and 3 for each row of the query
	for row in results:

		print("subject={}, drsID={}".format(row[0], row[1]))
		resRow = [row[0], row[1]]
		# Step 2 - Use DRS to get the URL
		# get the prefix
		prefix, drsid = row[1].split(":", 1)
		drsClient = drsClients[prefix]
		searchClient = discoveryClients[prefix]
		creditor.creditClass(drsClient)
		url = drsClient.getAccessURL(drsid)
		print(url)
		#objInfo = drsClient.getObject(drsid)
		#print (objInfo)
		#fileSize = objInfo['size']
		fileSize = 0
				
		# Step 3 - Run a pipeline on the file at the drs url
		if url is not None:
			outfile = "{}.txt".format(row[0])
			mysam = samClients[prefix]
			creditor.creditClass(mysam)
			via = 'sh'
			note = 'Two dbGaP sources'
			time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
			run_id = mysam.runWorkflow(url, outfile)
			faspRunner.logRun(time, via, note,  run_id, outfile, fileSize,
				searchClient, drsClient, mysam)
			resRow.append('OK')
		else:
			print('could not get DRS url')
			resRow.append('unauthorized')
Example #20
def main(argv):

	
	faspRunner = FASPRunner(pauseSecs=0)
	creditor = faspRunner.creditor
	settings = faspRunner.settings
	
	# Step 1 - Discovery
	# query for relevant DRS objects
	searchClient = BigQuerySearchClient()

	# TCGA Query - CRDC
	crdcquery = """
     	SELECT 'case_'||associated_entities__case_gdc_id , 'crdc:'||file_id
		FROM `isb-cgc.GDC_metadata.rel24_fileData_active` 
		where data_format = 'BAM' 
		and project_disease_type = 'Breast Invasive Carcinoma'
		limit 3"""
	
	# COPD query - TOPMed
	bdcquery = """
		SELECT SUBJECT_ID, 'bdc:'||read_drs_id
		FROM `isbcgc-216220.COPDGene.phenotype_drs`
		where Weight_KG between 92.5 and 93.0
		LIMIT 3"""

	results = searchClient.runQuery(crdcquery)  # Send the query
	creditor.creditFromList('ISBGDCData')
	results += searchClient.runQuery(bdcquery)  
	creditor.creditFromList('BDCData')

	# Step 2 - DRS - set up DRS Clients	
	drsClients = {
		"crdc": crdcDRSClient('~/.keys/crdc_credentials.json', ''),
		"bdc": bdcDRSClient('~/.keys/bdc_credentials.json', '')
	}
	print('setting credentials ')
	creditor.creditFromList('dbGaPFence')
	
	# Step 3 - set up a class that runs samtools for us
	# providing the location for the results
	location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
	mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])
	
	# repeat steps 2 and 3 for each row of the query
	for row in results:

		print("subject={}, drsID={}".format(row[0], row[1]))
		
		# Step 2 - Use DRS to get the URL
		# get the prefix
		prefix, drsid = row[1].split(":", 1)
		url = drsClients[prefix].getAccessURL(drsid, 'gs')
		drsClient = drsClients[prefix]
		creditor.creditClass(drsClient)
		objInfo = drsClient.getObject(drsid)
		fileSize = objInfo['size']
				
		# Step 3 - Run a pipeline on the file at the drs url
		outfile = "{}.txt".format(row[0])
		mysam.runWorkflow(url, outfile)
		creditor.creditClass(mysam)
		via = 'sh'
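		# manual placeholder: the real run id is pasted into the log later; Example #3 checks for it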
		pipeline_id = 'paste here'
		note = 'Two sources'
		time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
		faspRunner.logRun(time, via, note,  pipeline_id, outfile, fileSize,
			searchClient, drsClient, mysam)
			
	creditor.creditFromList('FASPScript2_sdrf', closeImage=False)
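
If the samtools client returns a run id (as the same GCPLSsamtools class does in Example #17), the placeholder can be avoided; a hedged one-line variant:

# Assumes runWorkflow returns an id, as captured in Example #17.
pipeline_id = mysam.runWorkflow(url, outfile)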