def get_transcript(self, ensg):
    """Look up the Ensembl transcript and peptide IDs for a gene.

    Queries the ``hsapiens_gene_ensembl`` BioMart dataset, filtering by the
    given Ensembl gene ID and by the RefSeq mRNA accession stored on the
    instance (``self.refseq``).

    :param ensg: ENSG number.
    :return: ``(transcript, protein)`` taken from the first data row of the
        response.
    :raises IndexError: if the response holds no data row, or the row has
        fewer than two whitespace-separated fields.
    """
    # 1) Connect to BioMart server.
    server = BiomartServer("http://www.ensembl.org/biomart")
    server.verbose = True  # provides setting up details

    # 2) Select dataset to check against.
    hs_genes = server.datasets['hsapiens_gene_ensembl']

    # 3) Filter by ENSG and RefSeq to get transcript & peptide.
    results = hs_genes.search({
        'filters': {
            'ensembl_gene_id': '%s' % ensg,
            'refseq_mrna': '%s' % self.refseq,
        },
        'attributes': ['ensembl_transcript_id', 'ensembl_peptide_id'],
    }, header=1)

    # 4) Decode the response and split every line into fields.
    rows = [line.decode('utf-8').split() for line in results.iter_lines()]

    # header=1 was requested, so rows[0] is the header and rows[1] the first
    # hit. Fail with a descriptive IndexError (the exception type the
    # original indexing raised) instead of "list index out of range".
    if len(rows) < 2:
        raise IndexError(
            'BioMart returned no transcript for %s / %s' % (ensg, self.refseq))
    first_hit = rows[1]
    if len(first_hit) < 2:
        raise IndexError(
            'BioMart row for %s / %s lacks a transcript or peptide ID: %r'
            % (ensg, self.refseq, first_hit))
    return first_hit[0], first_hit[1]
def serverConnection(proxy=None, verbose=True):
    """Create a ``BiomartServer`` for the Gramene Ensembl BioMart endpoint.

    :param proxy: fallback HTTP proxy; when it is not None the server's
        ``http_proxy`` is set to the ``http_proxy`` environment variable if
        that is defined, otherwise to this value.
    :param verbose: value assigned to the server's ``verbose`` attribute.
    :return: the configured ``BiomartServer`` instance.
    """
    gramene = BiomartServer("http://ensembl.gramene.org/biomart")

    # Only touch the proxy setting when the caller supplied one.
    if proxy is not None:
        env_proxy = os.environ.get('http_proxy')
        gramene.http_proxy = proxy if env_proxy is None else env_proxy

    gramene.verbose = verbose
    return gramene
def server_connect(self):
    """Create the GRCh37 BioMart server and store its human SNP dataset.

    Constructing the server is retried without limit for as long as it
    raises ``requests.exceptions.ConnectionError``; each failure prints
    ``Connection error``. On success the server is made verbose and
    ``server.datasets['hsapiens_snp']`` is stored in ``self.hsapiens_snp``.
    """
    while True:
        try:
            server = BiomartServer("http://grch37.ensembl.org/biomart")
        except requests.exceptions.ConnectionError:
            print('Connection error')
            continue
        break

    server.verbose = True
    self.hsapiens_snp = server.datasets['hsapiens_snp']
def GOquery(query, organism, gene_id):
    """Search an ``<organism>_gene_ensembl`` dataset for ``with_go`` values.

    :param query: value(s) passed as the filter for ``gene_id``.
    :param organism: dataset prefix; ``'_gene_ensembl'`` is appended to it.
    :param gene_id: name used both as the filter key and as the first
        returned attribute.
    :return: the response object returned by the dataset's ``search``.
    """
    mart = BiomartServer("http://www.biomart.org/biomart")
    mart.verbose = True  # emit progress messages

    dataset_name = organism + '_gene_ensembl'
    dataset = mart.datasets[dataset_name]

    search_params = {
        'filters': {gene_id: query},
        'attributes': [gene_id, 'with_go'],
    }
    return dataset.search(search_params)
def vb_request(path: str, ds: tuple):
    """Access VectorBase API to get gene positions of requested species.

    For every dataset prefix in ``ds`` the gene coordinates are appended to
    ``<path>/<index>_gene.tsv``. Afterwards the homolog gene IDs of
    ``ds[1:]`` relative to ``ds[0]`` are written (with a header row) to
    ``<path>/homology_genes.tsv``.

    :param path: output directory.
    :param ds: dataset prefixes; ``'_eg_gene'`` is appended to each one to
        form the dataset name. ``ds[0]`` is the dataset queried for homologs.
    :return: None; prints ``VB done.`` when finished.
    """
    import os  # local import: only this function needs it

    # Init server
    server = BiomartServer('http://biomart.vectorbase.org/biomart/')

    # Iterate over requests for gene positions of requested species
    attr = ('chromosome_name', 'start_position', 'end_position', 'strand',
            'ensembl_gene_id')
    for i, n in enumerate(ds):
        name = os.path.join(path, f'{i}_gene.tsv')
        # Opened in append mode, so repeated runs extend existing files.
        with open(name, 'a') as f:
            rqst = server.datasets[n + '_eg_gene'].search({
                'filters': {},
                'attributes': attr
            })
            for line in rqst.iter_lines():
                f.write(line.decode('utf-8') + '\n')

    # Request for orthologous genes of requested species
    attr = ['ensembl_gene_id'] + [
        f'{n}_eg_homolog_ensembl_gene' for n in ds[1:]
    ]
    # Previously built as ``path + 'homology_genes.tsv'``, which dropped the
    # directory separator and wrote next to ``path`` instead of inside it.
    name = os.path.join(path, 'homology_genes.tsv')
    with open(name, 'w') as f:
        s = server.datasets[ds[0] + '_eg_gene'].search(
            {
                'filters': {},
                'attributes': attr
            }, header=1)
        for line in s.iter_lines():
            f.write(line.decode('utf-8') + '\n')

    print('VB done.')
    return None
def _fetchFromServer(ensemble_server, attributes):
    """Fetch attributes of the human gene dataset as a DataFrame.

    :param ensemble_server: BioMart URL handed to ``BiomartServer``.
    :param attributes: attribute names to request.
    :return: DataFrame parsed from the tab-separated response, whose first
        line is the header requested with ``header=1``.
    """
    human_genes = BiomartServer(ensemble_server).datasets['hsapiens_gene_ensembl']
    response = human_genes.search({'attributes': attributes}, header=1)
    tsv_text = response.content.decode()
    return pd.read_csv(io.StringIO(tsv_text), sep='\t')
def QueryBioMart(dataset, attributes, host=host):
    """Query a BioMart dataset and return the result as a DataFrame.

    :param dataset: name of the dataset to look up on the server.
    :param attributes: attribute names to request; also used as the column
        names of the returned frame.
    :param host: BioMart URL; defaults to the module-level ``host``.
    :return: DataFrame with one row per line of the response body.
    """
    server = BiomartServer(host)
    organism = server.datasets[dataset]
    response = organism.search({'attributes': attributes})

    # ``response.content`` is bytes on Python 3, where splitting it on the
    # str "\n" raises TypeError, so decode it before splitting.
    payload = response.content
    if isinstance(payload, bytes):
        payload = payload.decode("utf-8")

    # NOTE(review): a body ending in "\n" yields a final empty row, exactly
    # as the original split did; confirm whether callers rely on it.
    rows = [line.split("\t") for line in payload.split("\n")]
    return pd.DataFrame(rows, columns=attributes)
def get_transcript(ensg, refseq):
    """Look up the Ensembl transcript and peptide IDs for a gene.

    Queries the ``hsapiens_gene_ensembl`` BioMart dataset, filtering by both
    the Ensembl gene ID and the RefSeq mRNA accession.

    :param ensg: Ensembl gene ID used for the ``ensembl_gene_id`` filter.
    :param refseq: RefSeq accession used for the ``refseq_mrna`` filter.
    :return: ``(transcript, protein)`` taken from the first data row of the
        response.
    :raises IndexError: if the response holds no data row, or the row has
        fewer than two whitespace-separated fields.
    """
    server = BiomartServer("http://www.ensembl.org/biomart")
    server.verbose = True

    hs_genes = server.datasets['hsapiens_gene_ensembl']
    results = hs_genes.search({
        'filters': {
            'ensembl_gene_id': '%s' % ensg,
            'refseq_mrna': '%s' % refseq,
        },
        'attributes': ['ensembl_transcript_id', 'ensembl_peptide_id'],
    }, header=1)

    rows = [line.decode('utf-8').split() for line in results.iter_lines()]

    # header=1 was requested, so rows[0] is the header and rows[1] the first
    # hit. Keep raising IndexError (as the original indexing did) but say
    # what actually went wrong.
    if len(rows) < 2:
        raise IndexError(
            'BioMart returned no transcript for %s / %s' % (ensg, refseq))
    first_hit = rows[1]
    if len(first_hit) < 2:
        raise IndexError(
            'BioMart row for %s / %s lacks a transcript or peptide ID: %r'
            % (ensg, refseq, first_hit))
    return first_hit[0], first_hit[1]
def queryBM(query_attributes, query_dataset, query_filter=None, query_items=None, query_dic=None, host=biomart_host):
    """
    Queries BioMart.

    :param query_attributes: list of attributes to recover from BioMart
    :param query_dataset: dataset to query
    :param query_filter: one BioMart filter associated with the items being queried
    :param query_items: list of items to be queried (must associate with the given filter);
        they are sent in chunks of 350
    :param query_dic: for complex queries this option should be used instead of
        'query_filter' and 'query_items', with a dictionary of filters provided here,
        e.g. query_dic={"filter1":["item1","item2"],"filter2":["item3","item4"]}.
        It is sent as a single request, so don't query more than 350 items at once.
    :param host: address of the host server, defaults to the module-level ``biomart_host``

    :returns: a Pandas dataframe of the queried attributes
    """
    server = BiomartServer(host)
    d = server.datasets[query_dataset]

    # Build the list of requests first; every response is parsed the same way.
    if query_dic:
        searches = [{'filters': query_dic, 'attributes': query_attributes}]
    elif query_items:
        # ``range`` replaces ``xrange``, which does not exist on Python 3.
        searches = [
            {
                'filters': {query_filter: query_items[x:x + 350]},
                'attributes': query_attributes,
            }
            for x in range(0, len(query_items), 350)
        ]
    else:
        searches = [{'attributes': query_attributes}]

    res = []
    for params in searches:
        response = d.search(params)
        for line in response.iter_lines():
            res.append(line.decode('utf-8').split("\t"))

    # Passing ``columns`` to the constructor also yields a correctly labelled
    # empty frame when nothing matched; assigning ``res.columns`` afterwards
    # raised on an empty result.
    return pd.DataFrame(res, columns=query_attributes)
def main(newfile):
    """Write the gene ID / peptide ID pairs of the human gene dataset to a file.

    Every response line is split into ``(gene_id, peptide_id)``; pairs with
    an empty peptide ID are skipped, the rest are written tab-separated.

    :param newfile: path of the output file (overwritten).
    """
    atts = ['ensembl_gene_id', 'ensembl_peptide_id']
    url = 'http://www.ensembl.org/biomart'
    server = BiomartServer(url)
    hge = server.datasets['hsapiens_gene_ensembl']

    with open(newfile, 'w') as outF:
        s = hge.search({'attributes': atts}, header=0)
        for line in s.iter_lines():
            # Splitting bytes on the str '\t' raises TypeError on Python 3,
            # so decode lines that arrive as bytes.
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            gene_id, peptide_id = line.split('\t')
            if len(peptide_id) > 0:
                outF.write('%s\t%s\n' % (gene_id.strip(), peptide_id.strip()))
def get_region_from_rsid(rsid_list):
    """Resolve rsIDs to ``chr:start-stop`` region strings via BioMart.

    The IDs are first looked up with the ``snp_filter`` filter. IDs that are
    still absent from the results are then retried with
    ``snp_synonym_filter``. Only rows whose chromosome name is in the
    module-level ``CONTIG`` are kept.

    :param rsid_list: rsIDs to look up; a falsy value returns ``[]`` without
        contacting the server.
    :return: list of ``"<chr>:<start>-<stop>"`` strings for the kept rows.
    """
    region_list = []
    if rsid_list:
        from biomart import BiomartServer

        server = BiomartServer("http://uswest.ensembl.org/biomart/")
        db = server.datasets['hsapiens_snp']
        attributes = ['refsnp_id', 'chr_name', 'chrom_start', 'chrom_end']

        response = db.search({'filters': {'variation_source': 'dbSNP',
                                          'snp_filter': rsid_list},
                              'attributes': attributes})
        output = dict()
        for line in response.iter_lines():
            line = line.decode('utf-8').split("\t")
            # A short row raises IndexError below; catch only that instead of
            # a bare ``except`` that would also hide KeyboardInterrupt.
            try:
                if line[0] in output:
                    print('Multiple entries for: {}'.format(line[0]))
                if line[1] in CONTIG:
                    output[line[0]] = {'chr': line[1],
                                       'start': line[2],
                                       'stop': line[3]}
            except IndexError:
                if line[0] not in output:
                    print('Error getting chr pos for: {}'.format(line[0]))

        present = set(output)
        if len(present) < len(rsid_list):
            missing = [x for x in rsid_list if x not in present]
            print("Missing following rsID: " + ', '.join(missing))
            response = db.search({'filters': {'variation_source': 'dbSNP',
                                              'snp_synonym_filter': missing},
                                  'attributes': attributes})
            print('Converted the following rsID:')
            for i, line in enumerate(response.iter_lines()):
                line = line.decode('utf-8').split("\t")
                try:
                    if line[1] in CONTIG:
                        output[line[0]] = {'chr': line[1],
                                           'start': line[2],
                                           'stop': line[3]}
                        # NOTE(review): pairs the i-th response row with
                        # missing[i]; this assumes rows come back in request
                        # order with one row per ID - confirm.
                        print(missing[i] + ' -> ' + line[0])
                except IndexError:
                    print("Error getting chr pos for: {}".format(line[0]))

        region_list = [
            str(entry["chr"]) + ":" + str(entry["start"]) + "-" + str(entry["stop"])
            for entry in output.values()
            if entry["chr"] and entry["start"] and entry["stop"]
        ]
    return region_list
def datasetsBM(host=biomart_host):
    """
    Lists BioMart datasets.

    The output of ``show_datasets()`` is captured, braces are replaced with
    spaces and ``": "`` with a tab, and the result is printed.

    :param host: address of the host server, defaults to the module-level ``biomart_host``

    :returns: nothing
    """
    stdout_ = sys.stdout  # Keep track of the previous value.
    stream = StringIO()
    sys.stdout = stream
    try:
        # Use the ``host`` argument; the original always connected to
        # ``biomart_host`` and silently ignored the parameter.
        server = BiomartServer(host)
        server.show_datasets()
    finally:
        # Restore stdout even when the server call raises.
        sys.stdout = stdout_

    variable = stream.getvalue()
    v = variable.replace("{", " ")
    v = v.replace("}", " ")
    v = v.replace(": ", "\t")
    print(v)
def searchDataset(chrom, start, end):
    """Search the ``ccrigri_gene_ensembl`` dataset for a genomic region.

    :param chrom: value for the ``chromosome_name`` filter.
    :param start: value for the ``start`` filter.
    :param end: value for the ``end`` filter.
    :return: one decoded, tab-separated response line.
    """
    from biomart import BiomartServer

    asia_mart = BiomartServer("http://asia.ensembl.org/biomart")
    asia_mart.verbose = True

    region_filters = {
        "chromosome_name": chrom,
        "start": start,
        "end": end
    }
    wanted = [
        "ensembl_gene_id", "ensembl_transcript_id", "refseq_mrna",
        "external_gene_name", "start_position", "end_position",
        "wikigene_description"
    ]
    hits = asia_mart.datasets["ccrigri_gene_ensembl"].search({
        "filters": region_filters,
        "attributes": wanted
    })

    # NOTE(review): the source this was taken from had lost its indentation,
    # so it is unclear whether ``return`` sat inside the loop (first line) or
    # after it (last line). This keeps the in-loop reading - confirm.
    for raw in hits.iter_lines():
        return raw.decode("utf-8")
def __init__(self, **kwargs):
    """
    Set up the connection objects for the GRCh37 BioMart server.

    The server's ``verbose`` flag is taken from ``self.VERBOSE``, and its
    HTTP proxy from ``self.proxies['http']`` when that entry exists. The
    ``ENSEMBL_MART_SNP`` database and its ``hsapiens_snp`` dataset are kept
    on the instance.
    """
    super().__init__(**kwargs)

    self.server = BiomartServer('http://grch37.ensembl.org/biomart')
    self.server.verbose = self.VERBOSE

    proxies = self.proxies
    if proxies and 'http' in proxies:
        self.server.http_proxy = proxies['http']

    snp_mart = self.server.databases['ENSEMBL_MART_SNP']
    self.database = snp_mart
    self.human_snps = snp_mart.datasets['hsapiens_snp']
def __init__(self):
    """Store the GRCh37 BioMart server, its SNP database and SNP dataset.

    Sets ``self.server``, ``self.database`` (``ENSEMBL_MART_SNP``) and
    ``self.dataset`` (``hsapiens_snp``).
    """
    # GRCh37 is also known as hg19
    grch37_url = "http://grch37.ensembl.org/biomart"
    self.server = BiomartServer(grch37_url)

    snp_mart = self.server.databases["ENSEMBL_MART_SNP"]
    self.database = snp_mart
    self.dataset = snp_mart.datasets["hsapiens_snp"]
def get_ENSEMBL_data(listID, dataset, transcr_expr_file=None):
    """Connect to ENSEMBL and retrieve transcript data for a list of genes.

    The gene IDs are queried in chunks of 100. Only rows whose
    'Transcript type' is 'protein_coding' are kept. The collected frames and
    ``transcr_expr_file`` are handed to ``select_transcripts``, which picks
    the transcripts to return.

    :param listID: Ensembl gene IDs to query.
    :param dataset: name of the BioMart dataset to query.
    :param transcr_expr_file: optional argument forwarded unchanged to
        ``select_transcripts``.
    :return: Pandas data frame of the selected transcripts, with the
        'cDNA coding start' / 'cDNA coding end' columns reduced to a single
        value each and the TSL column reduced to its first token.
    """
    log = sys.stderr

    def _protein_coding(response):
        # Parse a tab-separated response and keep protein-coding rows only.
        frame = pd.read_csv(io.StringIO(response.text), sep='\t', encoding='utf-8')
        return frame[frame['Transcript type'] == 'protein_coding']

    print("Connection to ENSEMBL server.", file=log)
    server = BiomartServer("http://www.ensembl.org/biomart/")
    dt = server.datasets[dataset]

    print("Retrieve the dataset...", file=log)
    feature_attrs = ['ensembl_gene_id', 'ensembl_transcript_id',
                     'external_gene_name', 'transcript_length',
                     'transcript_biotype', 'cdna_coding_start',
                     'cdna_coding_end', 'cdna', 'description']
    selection_attrs = ['ensembl_gene_id', 'ensembl_transcript_id',
                       'transcript_tsl', 'transcript_appris',
                       'transcript_source', 'transcript_length',
                       'transcript_biotype']
    print("Fetch data from: {}".format(str(server)), file=log)

    # Accumulate the per-chunk results of both queries.
    dfFeat = pd.DataFrame()
    dfTrans = pd.DataFrame()
    for chunk in chunks(listID, 100):
        print('Fetching...', file=log)
        gene_filter = {'ensembl_gene_id': chunk}
        res1 = dt.search({'filters': gene_filter, 'attributes': feature_attrs},
                         header=1)
        res2 = dt.search({'filters': gene_filter, 'attributes': selection_attrs},
                         header=1)
        dfFeat = pd.concat([dfFeat, _protein_coding(res1)], axis=0, sort=False)
        dfTrans = pd.concat([dfTrans, _protein_coding(res2)], axis=0, sort=False)
    # NOTE(review): the source had lost its indentation; this message is
    # assumed to be printed once after the loop, not per chunk - confirm.
    print("...fetch done!", file=log)

    transcripts = select_transcripts(dfTrans, dfFeat, transcr_expr_file)

    # Collapse the ';'-separated coding coordinates to one start and one end.
    start_col = "cDNA coding start"
    end_col = "cDNA coding end"
    tsl_col = "Transcript support level (TSL)"
    for index, row in transcripts.iterrows():
        # Smallest start and largest end among the listed coordinates.
        coding_start = min(int(x) for x in row[start_col].split(";"))
        coding_end = max(int(x) for x in row[end_col].split(";"))
        transcripts.at[index, start_col] = coding_start
        transcripts.at[index, end_col] = coding_end

        # Keep only the first whitespace-separated token of the TSL value.
        transcripts.at[index, tsl_col] = row[tsl_col].split()[0]
    return transcripts
def get_region_from_gene(gene_list):
    """Resolve HGNC gene symbols to ``chr:start-stop`` region strings.

    Queries ``hsapiens_gene_ensembl`` for ``ensembl_havana`` entries of the
    given symbols. Rows whose chromosome, start or end cannot be converted
    with ``int()`` are reported and skipped, so genes on non-numeric
    chromosome names do not appear in the result.

    :param gene_list: HGNC symbols to look up.
    :return: list of ``"<chr>:<start>-<stop>"`` strings.
    """
    from biomart import BiomartServer

    server = BiomartServer("http://uswest.ensembl.org/biomart/")
    db = server.datasets['hsapiens_gene_ensembl']
    response = db.search({
        'filters': {
            'source': 'ensembl_havana',
            'hgnc_symbol': gene_list
        },
        'attributes':
        ['hgnc_symbol', 'chromosome_name', 'start_position', 'end_position']
    })

    output = dict()
    for line in response.iter_lines():
        line = line.decode('utf-8').split("\t")
        # Short rows raise IndexError and non-numeric fields ValueError; catch
        # exactly those instead of a bare ``except`` that would also swallow
        # KeyboardInterrupt.
        try:
            if line[0] in output:
                print('Multiple entries for: {}'.format(line[0]))
                print(line)
            output[line[0]] = {
                'chr': int(line[1]),
                'start': int(line[2]),
                'stop': int(line[3])
            }
        except (IndexError, ValueError):
            if line[0] not in output:
                print('Error getting chr pos for: {}'.format(line[0]))

    present = set(output)
    if len(present) < len(gene_list):
        missing = [x for x in gene_list if x not in present]
        print("Missing following genes: " + ', '.join(missing))

    region_list = [
        str(entry["chr"]) + ":" + str(entry["start"]) + "-" + str(entry["stop"])
        for entry in output.values()
        if entry["chr"] and entry["start"] and entry["stop"]
    ]
    return region_list
def gene_name_to_ensembl_id():
    """Download the gene-name to Ensembl-gene-ID table from GRCh37 BioMart.

    :return: DataFrame whose column labels are taken from the header line of
        the response, with that header row removed and duplicate rows dropped.
    """
    # The GRCh37 host is used rather than the latest release to keep
    # everything in sync with build 37.
    grch37 = BiomartServer("http://grch37.ensembl.org/biomart")
    human_genes = grch37.datasets['hsapiens_gene_ensembl']

    response = human_genes.search(
        {'attributes': ['external_gene_name', 'ensembl_gene_id']}, header=1)
    rows = [raw.decode("utf-8").split("\t") for raw in response.iter_lines()]

    # The first row is the header line: promote it to column labels.
    table = pd.DataFrame(rows)
    table.columns = table.iloc[0]
    return table.drop([0]).drop_duplicates()
def getEnsemblAnnotation(species="hsapiens", filePath=None):
    """Fetch gene/transcript ID annotation for a species from BioMart.

    :param species: dataset prefix; the dataset queried is
        ``"<species>_gene_ensembl"`` of the ``ensembl`` database.
    :param filePath: when not None, the annotation is written there as CSV
        (without the index) and nothing is returned.
    :return: the annotation DataFrame when ``filePath`` is None, else None.
    """
    from biomart import BiomartServer
    import pandas as pd

    server = BiomartServer("http://www.biomart.org/biomart")
    ensembl = server.databases["ensembl"]
    genes = ensembl.datasets["{0}_gene_ensembl".format(species)]

    # NOTE(review): for every species other than "mmusculus" the first
    # requested attribute is the literal "hsapiens", which looks like a
    # placeholder for a gene-symbol attribute - confirm the intended name.
    attributes = ['mgi_symbol'] if species == "mmusculus" else ["hsapiens"]
    attributes += ['ensembl_gene_id', 'ensembl_transcript_id']

    response = genes.search({'attributes': attributes})
    results = []
    for line in response.iter_lines():
        # Splitting bytes on the str "\t" raises TypeError on Python 3, so
        # decode lines that arrive as bytes.
        if isinstance(line, bytes):
            line = line.decode("utf-8")
        results.append(line.split("\t"))

    annotation = pd.DataFrame(results, columns=attributes)
    if filePath is not None:
        annotation.to_csv(filePath, index=False)
        return None
    return annotation
def attributesBM(dataset, host=biomart_host):
    """
    Lists BioMart attributes for a specific dataset.

    The output of ``show_attributes()`` is captured, braces are replaced with
    spaces and ``": "`` with a tab, and the result is printed.

    :param dataset: dataset to list attributes of.
    :param host: address of the host server, defaults to the module-level ``biomart_host``

    :returns: nothing
    """
    stdout_ = sys.stdout  # Keep track of the previous value.
    stream = StringIO()
    sys.stdout = stream
    try:
        server = BiomartServer(host)
        d = server.datasets[dataset]
        d.show_attributes()
    finally:
        # Restore stdout even when the lookup or the server call raises;
        # otherwise all later prints would vanish into the buffer.
        sys.stdout = stdout_

    variable = stream.getvalue()
    v = variable.replace("{", " ")
    v = v.replace("}", " ")
    v = v.replace(": ", "\t")
    print(v)
def convert_entrez_to_gene_symbol(keyword_list):
    """Look up gene name and description for Entrez gene IDs using BioMart.

    The query runs against the ``hsapiens_gene_ensembl`` dataset of the
    March 2017 Ensembl archive and is filtered by ``entrezgene``.

    :param keyword_list: values passed to the ``entrezgene`` filter.
    :return: the response object from the dataset's ``search`` (no header
        row is requested).
    """
    archive = BiomartServer("http://mar2017.archive.ensembl.org/biomart")
    query = {
        'filters': {
            'entrezgene': keyword_list
        },
        'attributes': ['external_gene_name', 'description', 'entrezgene']
    }
    return archive.datasets['hsapiens_gene_ensembl'].search(query)
# Script: print the requested attributes for every entry of the human gene
# dataset, one raw response line at a time.
from biomart import BiomartServer

# Attributes requested from the dataset.
atts = ['external_gene_name','external_gene_source','ensembl_gene_id',
        'ensembl_transcript_id','ensembl_peptide_id']

server = BiomartServer( "http://www.biomart.org/biomart" )
hge = server.datasets['hsapiens_gene_ensembl']

# Show the 'ensembl' database entry of the server.
print(server.databases['ensembl'])

# header=1 asks for a header row as the first line of the response.
s = hge.search({'attributes': atts}, header=1)

# Lines are printed exactly as yielded by iter_lines(), without decoding.
for l in s.iter_lines():
    print(l)
#Code: Access Biomart #!/usr/bin/python #import BiomartServer from the #biomart module from biomart import BiomartServer #connect to the biomart server server = BiomartServer("http://www.biomart.org/biomart") #show all the server databases on the console server.show_databases() #show all the server datasets on the console server.show_datasets() #use the 'uniprot' dataset uniprot = server.datasets['uniprot'] #run a search with custom filters and attributes #such as searching for results with the #protein name "Dystrophin" #the UniProt accession number, #the protein name and the #gene name are give for #each search result
#!/usr/bin/env python # https://pypi.python.org/pypi/biomart from biomart import BiomartServer #Connect to a Biomart Server server = BiomartServer( "http://www.biomart.org/biomart" ) # set verbose to True to get some messages server.verbose = True
# Script: look up the protein names of two UniProt accessions via BioMart.
from biomart import BiomartServer

server = BiomartServer( "http://www.biomart.org/biomart" )
# if not behind a proxy, otherwise see: https://pypi.python.org/pypi/biomart/0.8.0
server.verbose = True  # set verbose to True to get some messages

# The search below needs the dataset object; the snippet used ``uniprot``
# without ever defining it, which fails with NameError.
uniprot = server.datasets['uniprot']

# run a search with custom filters and attributes (no header)
response = uniprot.search({
    'filters': {
        'accession': ['Q9FMA1', 'Q8LFJ9']
    },
    'attributes': [
        'accession', 'protein_name'
    ]
})
# FUNCTIONS ==================================================================== # RUN ========================================================================== # GENE LIST # ------------------------------------------------------------------------------ # Read gene list gene_list = pd.read_csv(gene_list_file, header=None)[0].tolist() # BioMaRt # Connect to the server and prepare for querying # ------------------------------------------------------------------------------ # Connect to biomart server = BiomartServer("http://www.ensembl.org/biomart") server.verbose = False # Check available databases #server.show_databases() # Select Genes database db = server.databases['ENSEMBL_MART_ENSEMBL'] # Check available datasets (species) #db.show_datasets() # Select H. sapiens dataset ds = db.datasets['hsapiens_gene_ensembl'] # show all available filters and attributes of the 'uniprot' dataset
def _add_joined_column(gene_infor, mapping, column):
    """Add ``column`` to ``gene_infor``: comma-joined ``mapping[column]`` per gene."""
    if mapping.shape[0] == 0:
        return
    joined = mapping.groupby('gene')[column].apply(lambda x: ', '.join(x))
    gene_infor[column] = [
        joined[i] if i in joined else '' for i in gene_infor['gene']
    ]


def biomart_anno(url, loca):
    """Build a per-gene annotation table from BioMart and save it as TSV.

    The species is taken from the first path component of ``url`` and turned
    into a dataset code (first letter of the first ``_``-separated part,
    lower-cased, plus the second part). Basic gene information is fetched,
    the position columns are merged into one ``gene_position`` column, and
    GO, KEGG, Entrez and SwissProt annotations are added as comma-joined
    columns whenever the corresponding helper returns rows.

    :param url: URL whose scheme and host locate the BioMart server and whose
        first path component names the species.
    :param loca: output directory; created if missing. The table is written
        to ``<loca>/gene.infor.tsv``.
    """
    urlp = urlparse(url)
    species = urlp.path.split('/')[1]
    code = species.split('_')[0][0].lower() + species.split('_')[1]

    server = BiomartServer('%s://%s/biomart' % (urlp.scheme, urlp.netloc))
    datasets = server.datasets
    print("Connecting to Ensembl biomart...")

    # Hosts starting with one of these names use '<code>_eg_gene' datasets,
    # all others '<code>_gene_ensembl'.
    eg_hosts = ['metazoa', 'plants', 'fungi', 'bacteria', 'protists']
    dn = code + ('_eg_gene' if urlp.netloc.split('.')[0] in eg_hosts
                 else '_gene_ensembl')
    ds = datasets[dn]

    # os.makedirs replaces ``os.system('mkdir -p %s' % loca)``, which passed
    # the argument through a shell (breaking on spaces and allowing command
    # injection) and only worked where ``mkdir -p`` exists.
    os.makedirs(loca, exist_ok=True)

    S = ds.search({'attributes': ['ensembl_gene_id', 'gene_biotype',
                                  'external_gene_name', 'description',
                                  'chromosome_name', 'start_position',
                                  'end_position', 'strand']})
    gene_infor = pd.DataFrame.from_records(
        [str(i, encoding='utf-8').split('\t') for i in S.iter_lines()],
        columns=['gene', 'gene_biotype', 'gene_name', 'gene_description',
                 'chromosome_name', 'start_position', 'end_position',
                 'strand'])

    # Translate numeric strand codes to +/-; leave anything else unchanged.
    m = {"-1": "-", "1": "+"}
    gene_infor['strand'] = [
        m[i] if i in m else str(i) for i in gene_infor['strand']
    ]

    # Merge the position columns into "chromosome:start:end:strand".
    position_cols = ['chromosome_name', 'start_position', 'end_position',
                     'strand']
    gene_infor['gene_position'] = gene_infor.loc[:, position_cols].apply(
        lambda x: ':'.join(x), axis=1)
    gene_infor.drop(position_cols, axis=1, inplace=True)

    # Each helper is called and its result attached before the next one runs,
    # in the same order as before.
    for fetch, column in ((getGO, 'GO_id'),
                          (getKEGG, 'KEGG_enzyme'),
                          (getEntrezgene, 'entrez'),
                          (getSwissProt, 'SwissProt')):
        _add_joined_column(gene_infor, fetch(ds, loca), column)

    gene_infor.to_csv(loca + '/gene.infor.tsv', sep='\t', index=False)
    print('saved %d records to %s/gene.infor.tsv' % (gene_infor.shape[0], loca))
import os
from glob import glob
from tqdm import tqdm
import pandas as pd
import click
from sklearn.linear_model import LogisticRegression
import numpy as np
from biomart import BiomartServer
import readquant

# Module-level BioMart handles, created at import time and shared by the
# functions below.
print('Connecting to biomart...')
SERVER = BiomartServer("http://www.ensembl.org/biomart")
ENSEMBL = SERVER.datasets['hsapiens_gene_ensembl']


def get_ERCC():
    """Return the 'concentration in Mix 1 (attomoles/ul)' entry of readquant's ERCC data."""
    return readquant.data.ERCC()['concentration in Mix 1 (attomoles/ul)']


def get_MT():
    """Query BioMart for the Ensembl gene IDs on chromosome 'MT'.

    NOTE(review): as visible here the function builds ``idx`` but neither
    returns nor uses it; the remainder of the body may be missing - confirm.
    """
    print('Get MT genes from biomart...')
    r = ENSEMBL.search({
        'filters': {
            'chromosome_name': 'MT'
        },
        'attributes': ['ensembl_gene_id']
    })
    # Whitespace-split the response text into an Index of gene IDs.
    idx = pd.Index(r.text.split())
''' from biomart import BiomartServer urls = { '92': 'http://apr2018.archive.ensembl.org/biomart', '91': 'http://dec2017.archive.ensembl.org/biomart', '90': 'http://aug2017.archive.ensembl.org/biomart', '89': 'http://may2017.archive.ensembl.org/biomart', '88': 'http://mar2017.archive.ensembl.org/biomart', '87': 'http://dec2016.archive.ensembl.org/biomart', '86': 'http://oct2016.archive.ensembl.org/biomart', '85': 'http://jul2016.archive.ensembl.org/biomart', '84': 'http://mar2016.archive.ensembl.org/biomart', '83': 'http://dec2015.archive.ensembl.org/biomart', '82': 'http://sep2015.archive.ensembl.org/biomart', '81': 'http://jul2015.archive.ensembl.org/biomart', '80': 'http://may2015.archive.ensembl.org/biomart', '79': 'http://mar2015.archive.ensembl.org/biomart', '78': 'http://dec2014.archive.ensembl.org/biomart', '77': 'http://oct2014.archive.ensembl.org/biomart', '76': 'http://aug2014.archive.ensembl.org/biomart', '75': 'http://feb2014.archive.ensembl.org/biomart', '74': 'http://dec2013.archive.ensembl.org/biomart' } server = BiomartServer( urls['${ENSEMBL_VERSION}'] ) tags = { 'pfam': 'Pfam', 'scanprosite': 'Prosite', 'prosite': 'Prosite' } ensembl = server.datasets['hsapiens_gene_ensembl'] response = ensembl.search({ 'attributes': [ 'ensembl_transcript_id', 'transcript_version', '${DB}', '${DB}_start', '${DB}_end' ] }) with open('${DB}_features.tsv', 'w') as OUT: for line in response.iter_lines(): line = line.decode('utf-8') t,v,f,s,e = line.split("\\t") if f: OUT.write('{}.{}\\t{}\\t{}\\t{}\\t{}\\n'. format(t, v, tags['${DB}'], f, s, e))
def __init__(self):
    """Create the Ensembl BioMart server handle kept privately on the instance."""
    ensembl_biomart_url = "http://www.ensembl.org/biomart"
    self.__server = BiomartServer(ensembl_biomart_url)
# Script: dump the gene IDs of the 'astephensi_eg_gene' dataset to JSON.
from biomart import BiomartServer
import json

server = BiomartServer('http://biomart.vectorbase.org/biomart')
steph = server.datasets['astephensi_eg_gene']

# Request only the gene ID attribute.
response = steph.search({'attributes': ['ensembl_gene_id']})

# Each decoded response line becomes one list entry.
genes = [raw.decode('utf-8') for raw in response.iter_lines()]

with open('./data/genes.json', 'w') as file:
    json.dump(genes, file)
from biomart import BiomartServer
from Clean import clean

server = BiomartServer( "http://biomart.vectorbase.org/biomart/" )

# Names noted by the original author:
#   'vb_gene_mart_1708': VectorBase Genes database
#   'agambiae_eg_gene': Anopheles gambiae genes (AgamP4) dataset
#   'agambiae_eg_genomic_sequence': Anopheles gambiae sequences (AgamP4)
# Exploration helpers:
#   show server databases: server.show_databases()
#   show server datasets:  server.show_datasets()
#   show the filters and attributes of a dataset object:
#   <dataset>.show_filters(), <dataset>.show_attributes()
# Attribute labels:
#   'ensembl_gene_id': 'Gene stable ID', 'strand': 'Strand',
#   'start_position': 'Gene start (bp)', 'end_position': 'Gene end (bp)',
#   'chromosome_name': 'Chromosome/scaffold name'


def vb_import(ds):
    """Print the gene positions of a VectorBase dataset, one line per gene.

    :param ds: name of the dataset to query, e.g. ``'agambiae_eg_gene'``.
    """
    r = server.datasets[ds].search({
        'filters': {},
        'attributes': [
            'chromosome_name', 'start_position', 'end_position', 'strand',
            'ensembl_gene_id'
        ]
    }, header=1)

    # The original opened 'test_response.txt' for reading here, never read
    # from it, and closed it after the loop. That made the function fail with
    # FileNotFoundError when the file was absent and leaked the handle when
    # the loop raised, so the dead handle was removed.
    # NOTE(review): if the intent was to save the response to that file, open
    # it in write mode and write each line instead - confirm.

    # response format is TSV
    for line in r.iter_lines():
        line = line.decode('utf-8')
        print(line)


#vb_import('agambiae_eg_gene')
abc = server.datasets['agambiae_eg_gene']
#abc.show_attributes()
import getopt, sys from biomart import BiomartServer from io import StringIO from os import listdir from os import walk import glob import pandas as pd server = BiomartServer('http://www.ensembl.org/biomart') hsap = server.datasets['hsapiens_gene_ensembl'] def strand_fix(i): if i == 1: return ('+') elif i == -1: return ('-') else: return ('invalid strand') def get_bed_data(extGeneNames): response = hsap.search( { 'filters': { 'external_gene_name': extGeneNames }, 'attributes': [ "chromosome_name", "start_position", "end_position", "strand", "external_gene_name" ]