def get_ga4gh_subpops(baseURL):
    """
    Map individual names to their descriptions for one GA4GH server.

    Only the first dataset reported by the server is consulted.

    :param baseURL: base URL handed to ``client.HttpClient``.
    :return: dict of ``individual.name -> individual.description``.
    :raises IndexError: if the server reports no datasets.
    """
    ga4gh = client.HttpClient(baseURL)
    first_dataset = list(ga4gh.search_datasets())[0]

    subpops = {}
    for person in ga4gh.search_individuals(first_dataset.id):
        subpops[person.name] = person.description
    return subpops
def runDemo():
    """
    Print variants on reference "1" in [45000, 50000) from a local server.

    Connects to ``http://localhost:8000``, searches a hard-coded variant set
    and writes one tab-separated row per variant: reference name, start,
    end, reference bases and alternate bases.
    """
    variants = client.HttpClient("http://localhost:8000").search_variants(
        "WyIxa2ctcDMtc3Vic2V0IiwidnMiLCJtdm5jYWxsIl0",
        reference_name="1",
        start=45000,
        end=50000)

    for v in variants:
        columns = (
            v.reference_name,
            v.start,
            v.end,
            v.reference_bases,
            v.alternate_bases,
        )
        print(*columns, sep="\t")
def setUp(self):
    """
    Build a client with mocked request helpers plus the fixture values.

    The client points at ``http://example.com``; each of its private
    ``_run_*`` request helpers is replaced by a ``mock.Mock``.  The
    remaining attributes are plain identifiers and query values stored on
    the test instance.
    """
    self.httpClient = client.HttpClient("http://example.com")
    # Replace the client's request helpers with mocks.
    for request_helper in (
            "_run_search_request",
            "_run_get_request",
            "_run_list_request",
            "_run_get_request_path",
            "_run_post_request"):
        setattr(self.httpClient, request_helper, mock.Mock())

    # Generic object identifiers.
    self.objectId = "SomeId"
    self.objectName = "objectName"
    self.datasetId = "datasetId"
    self.parentId = "parentId"

    # Variant, feature and continuous sets.
    self.variantSetId = "variantSetId"
    self.variantAnnotationSetId = "variantAnnotationSetId"
    self.featureSetId = "featureSetId"
    self.continuousSetId = "continuousSetId"
    self.feature = "feature"
    self.geneSymbol = "geneSymbol"
    self.callSetIds = ["id1", "id2"]

    # References and reads.
    self.referenceSetId = "referenceSetId"
    self.referenceId = "referenceId"
    # The original assigned referenceName twice with the same value;
    # a single assignment is sufficient.
    self.referenceName = "referenceName"
    self.readGroupIds = ["readGroupId"]
    self.assemblyId = "assemblyId"
    self.accession = "accession"
    self.md5checksum = "md5checksum"

    # Biosamples and individuals.
    self.biosampleId = "biosampleId"
    self.biosampleName = "biosampleName"
    self.individualName = "individualName"
    self.individualId = "individualId"

    # Query range and paging; the page size is also pushed to the client.
    self.start = 100
    self.end = 101
    self.pageSize = 1000
    self.httpClient.set_page_size(self.pageSize)

    # Genotype-to-phenotype queries.
    self.phenotype_association_set_id = "phenotype_association_set_id"
    self.feature_ids = ["id1", "id2"]
    self.phenotype_ids = ["id3", "id4"]
    self.evidence = protocol.EvidenceQuery()

    # RNA quantification.
    self.rnaQuantificationSetId = "rnaQuantificationSetId"
    self.rnaQuantificationId = "rnaQuantificationId"
    self.expressionLevelId = "expressionLevelId"
    self.threshold = 0.0
def build_variant_alias_dictionary(gene):
    """
    Build a lookup from every alias of a BRCA variant to its variant id.

    Searches the ``brca-hg37`` variant set at ``BRCA_GA4GH_URL`` over the
    span of ``gene`` widened by ``GENE_BUFFER`` on each side.  Every variant
    is registered under each entry of ``variant.names`` and under the first
    ``HGVS_cDNA`` and ``HGVS_Protein`` values of its ``info`` map.

    :param gene: object exposing ``chromosome``, ``start``, ``end`` and
        ``name`` attributes.
    :return: dict mapping alias -> variant id.
    :raises SystemExit: if a variant carries more than one ``HGVS_cDNA``
        value, which this function does not know how to handle.
    """
    ga4gh_client = client.HttpClient(BRCA_GA4GH_URL)
    variants = ga4gh_client.search_variants(
        reference_name=gene.chromosome,
        variant_set_id="brca-hg37",
        start=gene.start - GENE_BUFFER,
        end=gene.end + GENE_BUFFER)

    variant_aliases = {}
    variant_count = 0
    for variant in variants:
        variant_count += 1
        for name in variant.names:
            variant_aliases[name] = variant.id

        hgvs_cdna = variant.info["HGVS_cDNA"]
        if len(hgvs_cdna) > 1:
            # The original printed the first value to stdout and exited with
            # status 0; abort with a message and a failing status instead.
            sys.exit(
                "Unexpected multiple HGVS_cDNA values; first is {!r} ({})"
                .format(hgvs_cdna[0], type(hgvs_cdna[0])))

        if variant_count % 100 == 0:
            # The original line lacked `print`, so this progress message was
            # built as a tuple and silently discarded.
            print("Downloading BRCA variants: {} {} variants downloaded"
                  .format(variant_count, gene.name))

        variant_aliases[str(hgvs_cdna[0])] = variant.id
        variant_aliases[str(variant.info["HGVS_Protein"][0])] = variant.id

    # Single-argument print works on Python 2 and 3; the double space
    # reproduces the original statement's output.
    print("VARIANT COUNT IS:  {}".format(variant_count))
    return variant_aliases
#!/usr/bin/python from ga4gh.client import client import json import ga4gh.client.protocol as protocol ga4gh_endpoint = "http://10.96.11.130:8000" c = client.HttpClient(ga4gh_endpoint) def harvest(genes): datasets = c.search_datasets() phenotype_association_set_id = None phenotype_association_set_name = None for dataset in datasets: phenotype_association_sets = c.search_phenotype_association_sets(dataset_id=dataset.id) for phenotype_association_set in phenotype_association_sets: phenotype_association_set_id = phenotype_association_set.id phenotype_association_set_name = phenotype_association_set.name # print 'Found G2P phenotype_association_set:', phenotype_association_set.id, phenotype_association_set.name break assert phenotype_association_set_id assert phenotype_association_set_name feature_set_id = None datasets = c.search_datasets() for dataset in datasets: featuresets = c.search_feature_sets(dataset_id=dataset.id) for featureset in featuresets: if phenotype_association_set_name in featureset.name: feature_set_id = featureset.id
import requests.packages.urllib3 import datetime from ga4gh.client import client from itertools import chain requests.packages.urllib3.disable_warnings() httpClient = client.HttpClient( "https://brcaexchange.org/backend/data/ga4gh/v0.6.0a7/") chrom = {"BRCA1": "chr17", "BRCA2": "chr13"} annotCols = ['id', 'Pathogenicity_all'] def brca_query(gene, start_pos, end_pos): query = httpClient.search_variants(reference_name=chrom[gene], variant_set_id="brca-hg37", start=int(start_pos), end=int(end_pos)) listOutput = [] for var in query: posInfo = [ var.info['Gene_Symbol'].values[0].string_value, var.reference_name, var.start, var.end, str(var.reference_bases), str(var.alternate_bases[0]) ] annotInfo = [ var.info[x].values[0].string_value if x in var.info.keys() else '' for x in annotCols
import json from ga4gh.client import client if __name__ == '__main__': # [1] Boilerplate code to initialize GA4GH client c = client.HttpClient("http://1kgenomes.ga4gh.org") dataset = c.search_datasets().next() # [2] Fetch variant set for variant_set in c.search_variant_sets(dataset_id=dataset.id): if variant_set.name == "phase3-release": var_set = variant_set # [3] Get metadata, store in dictionary metadata = {'data': []} for data in variant_set.metadata: if '.' in data.key: key, identity = (str(x) for x in data.key.split('.')) metadata['data'].append({ 'key': key, 'id': identity, 'number': data.number, 'type': data.type, 'description': data.description }) # [4] Write hardcoded metadata tags not on server metadata['data'].append({ 'key': 'FORMAT', 'id': 'GT',
from __future__ import print_function import ga4gh.client.client as client rahman_client = client.HttpClient("http://52.160.96.216/ga4gh") """ ******************** get datasets and variant_sets ******************* """ datasets = list(rahman_client.search_datasets()) dataset = rahman_client.get_dataset(datasets[0].id) release = None functional = None for variant_set in rahman_client.search_variant_sets(dataset_id=dataset.id): if variant_set.name == "phase3-release": release = variant_set else: functional = variant_set """ *********************************************************************** """ def main(): callsi = list(rahman_client.search_call_sets(functional.id)) variant_sets = list(rahman_client.search_variant_sets(dataset.id)) variant_set_id = variant_sets[0].id print(variant_set_id) call_set_ids = [] callsi = list(rahman_client.search_call_sets(functional.id)) for csi in callsi:
def get_ga4gh_variants_dataframe(url, chrom, start, end, results, snps_only):
    """
    Fetch genotypes for a region from a GA4GH server into a DataFrame.

    The frame has one row per variant.  Its first column, ``index``, holds
    ``<chrom>_<start>_<end>_<ref>_<alt>``; the remaining columns are named
    after the callsets and hold the summed genotype of each call, e.g.

           index                     HG00099  HG001031
        0  10_94951137_94951138_C_A        0         0
        1  10_94951708_94951709_C_T        0         0
        (...)

    Nothing is returned.  The frame is stored in ``results`` under the key
    ``url + "<chrom>:<start>-<end>"``, where ``<chrom>`` is ``chrom`` with
    every ``'chr'`` removed.

    The first dataset and the first variant set on the server are used.

    :param str url: The url of the ga4gh server.
    :param str chrom: The chromosome for the region of interest.
    :param int start: The start position for the region of interest.
    :param int end: The end position for the region of interest.
    :param dict results: Mapping that receives the resulting DataFrame.
    :param bool snps_only: If true, keep only bi-allelic SNPs (one
        single-base reference allele and exactly one single-base alternate).
    :raises Exception: any error raised while querying is re-raised after a
        diagnostic line has been printed.
    """
    chrom = chrom.replace('chr', '')
    region = chrom + ":" + str(start) + "-" + str(end)
    print("server:{}, region {}:{}-{}".format(url, chrom, start, end))
    try:
        httpClient = client.HttpClient(url)
        # TODO: Assumption - uses the first dataset.
        datasets = list(httpClient.search_datasets())
        # TODO: Assumption - uses the first variantset.
        variantSets = list(httpClient.search_variant_sets(
            dataset_id=datasets[0].id))
        variant_set_id = variantSets[0].id

        callSets = list(httpClient.search_call_sets(variant_set_id))
        # Variants in [<start>, <end>) on <chrom>, with calls requested for
        # every callset of the variant set.
        iterator = httpClient.search_variants(
            variant_set_id=variant_set_id,
            reference_name=chrom,
            start=start,
            end=end,
            call_set_ids=[callset.id for callset in callSets])

        all_gts = []
        for variant in iterator:
            ref = variant.reference_bases
            alts = variant.alternate_bases
            # The original test (`len(ref) > 1 and len(alts) > 1`) let
            # indels and multi-allelic sites through; require the full
            # bi-allelic SNP shape instead.
            is_biallelic_snp = (
                len(ref) == 1 and len(alts) == 1 and len(alts[0]) == 1)
            if snps_only and not is_biallelic_snp:
                continue

            # var_id is the join key: <chrom>_<start>_<end>_<ref>_<alt>
            var_id = "_".join([
                variant.reference_name,
                str(variant.start),
                str(variant.end),
                ref,
                ",".join(alts)])
            # For bi-allelic sites 0/0, 0/1 and 1/1 sum to 0, 1 and 2.
            # NOTE(review): assumes variant.calls comes back in the same
            # order as callSets, since columns are named from callSets.
            gts = [var_id] + [int(sum(call.genotype))
                              for call in variant.calls]
            all_gts.append(gts)

        # columns = [var_id, callset1, callset2, ...]
        df = pda.DataFrame(
            all_gts,
            columns=['index'] + [callset.name for callset in callSets])
        results[url + region] = df
    except Exception:
        print("Can not query the region:{} from server:{}".format(region, url))
        raise
def main():
    """
    Count variants by reference-allele length using the GA4GH client.

    Lists the datasets at ``BASE_URL``, gathers every variant set they
    contain, searches the first variant set on reference "1" between
    100000 and 900000, and prints the number of variants followed by a
    dict of ``reference base length -> count``.
    """
    # Instantiate an HTTP client using the BASE_URL.
    c = client.HttpClient(BASE_URL)

    # The search call hands back a generator, so the datasets only
    # materialise as we walk through it.
    response = c.search_datasets()
    print(response)
    datasets = []
    for dataset in response:
        datasets.append(dataset)
        print(dataset)

    # Results are classed objects: attributes such as `.id` are read with
    # dot-notation, no json handling needed.
    variant_sets = [
        variant_set
        for dataset in datasets
        for variant_set in c.search_variant_sets(dataset.id)
    ]

    # Analyse a single variant set: the first one found.
    first_set = variant_sets[0]
    variants = c.search_variants(first_set.id, 100000, 900000, "1")

    # Paging is handled by the client, so this may pull many results.
    variant_list = list(variants)
    print(str(len(variant_list)) + " variants.")

    # Tally variants by the length of their reference bases.
    reference_base_counts = {}
    for variant in variant_list:
        length = len(variant.reference_bases)
        reference_base_counts[length] = (
            reference_base_counts.get(length, 0) + 1)

    # Did we get the same results as in `hello_ga4gh.py`?
    print(reference_base_counts)
# Lists the individuals of the first dataset on a GA4GH server, then its RNA
# quantification sets, the quantifications in each set and the expression
# levels of each quantification.
#
# Every print below uses the single-argument call form, which behaves the
# same on Python 2 and Python 3; the original mixed `print` statements with
# `print()` calls and used the Python-2-only `.next()` method.
#
# NOTE(review): the loop nesting was reconstructed from a source whose
# indentation had been collapsed - confirm it against the original layout.
from ga4gh.client import client

c = client.HttpClient("http://ga4gh_server:8000")
# Only the first dataset reported by the server is inspected.
dataset = next(c.search_datasets())

print("Individuals:")
for individual in c.search_individuals(dataset_id=dataset.id):
    print("Individual: {}".format(individual.name))
    print(" id: {}".format(individual.id))
    print(" dataset_id: {}".format(individual.dataset_id))
    print(" description: {}".format(individual.description))

print("RNA Quantification Sets:")
for rna_quant_set in c.search_rna_quantification_sets(dataset_id=dataset.id):
    print(" id: {}".format(rna_quant_set.id))
    print(" dataset_id: {}".format(rna_quant_set.dataset_id))
    print(" name: {}\n".format(rna_quant_set.name))

    print("RNA Quantifications:")
    for rna_quant in c.search_rna_quantifications(
            rna_quantification_set_id=rna_quant_set.id):
        print("RNA Quantification: {}".format(rna_quant.name))
        print(" id: {}".format(rna_quant.id))
        print(" description: {}\n".format(rna_quant.description))

        print("RNA Expression Levels:")
        for expression in c.search_expression_levels(
                rna_quantification_id=rna_quant.id):
            print("Expression Level: {}".format(expression.name))
            print(" id: {}".format(expression.id))
            print(" feature: {}".format(expression.feature_id))
def _plot_length_histogram(figure_number, lengths, colour, title, xlabel):
    """Draw a log-scaled histogram of integer ``lengths``, one bin per value."""
    if not lengths:
        # Nothing to plot; np.max would raise on an empty list.
        return
    plt.figure(figure_number)
    # Edges run 1 .. max+1 so the largest length gets a bin of its own.
    # The original stopped at max, and since the last histogram bin is
    # closed on both sides, lengths max-1 and max were counted together.
    edges = list(range(1, int(np.max(lengths)) + 2))
    counts, _edges, _patches = plt.hist(
        lengths, bins=edges, facecolor=colour, alpha=0.75, log=True)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('n variants of length (log)')
    # Extend the x axis to the final edge so the last bin stays visible
    # (the original limit, the number of bins, cut it off).
    plt.axis([0, edges[-1], 0, np.max(counts)])


def main():
    """
    Plot histograms of reference and alternate allele lengths.

    Takes the first variant set of the first dataset at ``BASE_URL``,
    collects its variants on reference "1" between 0 and 100000, prints how
    many were found and shows two log-scaled histograms: one of reference
    base lengths (figure 1) and one of alternate base lengths (figure 2).
    """
    # First, instantiate an HTTP client using the BASE_URL.
    c = client.HttpClient(BASE_URL)

    # The built-in next() takes the first item of an iterator and works on
    # both Python 2 and 3, unlike the `.next()` method used originally.
    dataset = next(c.search_datasets())
    variant_set = next(c.search_variant_sets(dataset.id))

    variants = c.search_variants(
        variant_set.id,         # The ID of the variantSet
        start=0,                # Start position
        end=100000,             # End position
        reference_name="1")     # chrom
    variant_list = list(variants)

    # One length per variant for the reference allele, and one length per
    # alternate allele of every variant.
    ref_lengths = [len(variant.reference_bases) for variant in variant_list]
    alt_lengths = [
        len(base)
        for variant in variant_list
        for base in variant.alternate_bases
    ]
    print(str(len(variant_list)) + " variants.")

    # see more examples
    # http://matplotlib.org/1.2.1/examples/pylab_examples/histogram_demo.html
    _plot_length_histogram(
        1, ref_lengths, 'red',
        "Frequency of reference base lengths", 'Length of reference')
    _plot_length_histogram(
        2, alt_lengths, 'blue',
        "Frequency of alternate base lengths", 'Length of alts')
    plt.show()
# Builds simons_csids.json: a mapping from call set id to
# (biosample name, variant set id) for every biosample whose
# 'Individual_id' info value equals the name of a variant set.
from __future__ import print_function

import json

import ga4gh.client.client as client

simons_client = client.HttpClient("http://10.50.100.241/")

# ******************** get datasets and variant_sets *******************
# Only the first dataset reported by the server is used.
datasets = list(simons_client.search_datasets())
dataset = simons_client.get_dataset(datasets[0].id)

# The variant sets do not depend on the biosample, so fetch them once
# instead of re-querying the server for every biosample.
variant_sets = list(simons_client.search_variant_sets(dataset_id=dataset.id))

simons_csid_dict = {}
for biosample in simons_client.search_biosamples(dataset_id=dataset.id):
    for variant_set in variant_sets:
        individual_id = biosample.info['Individual_id'].values[0].string_value
        if variant_set.name != individual_id:
            continue
        print(variant_set.name)
        print(variant_set.id)
        # Only the first call set of the matching variant set is recorded.
        callset = list(simons_client.search_call_sets(variant_set.id))[0]
        print(callset.id)
        biosample_name = biosample.info['Name'].values[0].string_value
        print(biosample_name)
        simons_csid_dict[callset.id] = (biosample_name, variant_set.id)

with open("simons_csids.json", 'w') as output_file:
    json.dump(simons_csid_dict, output_file)