Пример #1
0
    def get_transcript(self, ensg):
        """Look up the Ensembl transcript and peptide IDs for a gene.

        Queries the ``hsapiens_gene_ensembl`` BioMart dataset, filtering by the
        given Ensembl gene ID and by the RefSeq mRNA accession stored on the
        instance (``self.refseq``).

        :param ensg: Ensembl gene ID (ENSG number).
        :return: tuple ``(transcript, protein)`` taken from the first result row.
        :raises IndexError: if BioMart returns no result row, or the first row
            has fewer than two whitespace-separated fields.
        """
        # 1) Connect to the BioMart server.
        server = BiomartServer("http://www.ensembl.org/biomart")
        server.verbose = True  # enable the client's verbose mode
        # 2) Select the dataset to query.
        hs_genes = server.datasets['hsapiens_gene_ensembl']
        # 3) Filter by ENSG and RefSeq; request transcript and peptide IDs.
        results = hs_genes.search({
            'filters': {'ensembl_gene_id': '%s' % ensg,
                        'refseq_mrna': '%s' % self.refseq},
            'attributes': ['ensembl_transcript_id', 'ensembl_peptide_id']
        }, header=1)
        # 4) Decode every TSV line and split it into fields.
        rows = [line.decode('utf-8').split() for line in results.iter_lines()]
        # The search was made with header=1, so rows[0] is the header line and
        # the first real hit (if any) is rows[1].
        if len(rows) < 2:
            raise IndexError(
                'BioMart returned no transcript for gene %s and RefSeq %s'
                % (ensg, self.refseq))
        first_hit = rows[1]
        if len(first_hit) < 2:
            raise IndexError(
                'BioMart row for gene %s lacks a transcript or peptide ID: %r'
                % (ensg, first_hit))

        return first_hit[0], first_hit[1]
Пример #2
0
def serverConnection(proxy=None, verbose=True):
    """Create a BiomartServer client for the Gramene Ensembl BioMart.

    :param proxy: HTTP proxy to use when behind one. It is only consulted
        when not None, and the ``http_proxy`` environment variable takes
        precedence over it when that variable is set.
    :param verbose: value assigned to the server's ``verbose`` attribute.
    :return: the configured BiomartServer instance.
    """
    gramene = BiomartServer("http://ensembl.gramene.org/biomart")
    gramene_proxy_requested = proxy is not None
    if gramene_proxy_requested:
        # Environment setting wins; the argument is only the fallback.
        if 'http_proxy' in os.environ:
            gramene.http_proxy = os.environ['http_proxy']
        else:
            gramene.http_proxy = proxy
    gramene.verbose = verbose
    return gramene
Пример #3
0
    def server_connect(self):
        """Connect to the GRCh37 BioMart and store its ``hsapiens_snp`` dataset.

        Keeps retrying (printing ``Connection error`` each time) until the
        BiomartServer constructor stops raising
        ``requests.exceptions.ConnectionError``. The dataset is stored on
        ``self.hsapiens_snp``.
        """
        while True:
            try:
                grch37 = BiomartServer("http://grch37.ensembl.org/biomart")
            except requests.exceptions.ConnectionError:
                print('Connection error')
            else:
                break

        grch37.verbose = True
        self.hsapiens_snp = grch37.datasets['hsapiens_snp']
def GOquery(query, organism, gene_id):
    """Search an organism's ``*_gene_ensembl`` dataset for GO annotation.

    :param query: value(s) passed for the ``gene_id`` filter.
    :param organism: dataset prefix; ``'_gene_ensembl'`` is appended to it.
    :param gene_id: name used both as the filter and as the first attribute.
    :return: the response object returned by the dataset search.
    """
    biomart = BiomartServer("http://www.biomart.org/biomart")
    # Verbose mode makes the client print some messages.
    biomart.verbose = True

    dataset = biomart.datasets[organism + '_gene_ensembl']
    search_spec = {
        'filters': {gene_id: query},
        'attributes': [gene_id, 'with_go'],
    }
    return dataset.search(search_spec)
Пример #5
0
def vb_request(path: str, ds: tuple):
    """Download gene positions and homology tables from VectorBase BioMart.

    For every dataset prefix in ``ds`` the gene coordinates of
    ``<prefix>_eg_gene`` are appended to ``<path>/<index>_gene.tsv``. Then the
    homologs of the first species against all the others are written (with a
    header line) to ``<path>/homology_genes.tsv``.

    :param path: output directory; a trailing slash is optional.
    :param ds: dataset prefixes; ``ds[0]`` is the reference species for the
        homology query.
    :return: None.
    """
    # Local import so this function does not depend on a module-level import.
    import os

    # Init server
    server = BiomartServer('http://biomart.vectorbase.org/biomart/')
    # Iterate over requests for gene positions of requested species
    attr = ('chromosome_name', 'start_position', 'end_position', 'strand',
            'ensembl_gene_id')
    for i, n in enumerate(ds):
        name = os.path.join(path, f'{i}_gene.tsv')
        # Append mode is kept from the original implementation.
        with open(name, 'a') as f:
            rqst = server.datasets[n + '_eg_gene'].search({
                'filters': {},
                'attributes': attr
            })
            for line in rqst.iter_lines():
                f.write(line.decode('utf-8') + '\n')

    # Request for orthologous genes of requested species
    attr = ['ensembl_gene_id'] + [
        f'{other}_eg_homolog_ensembl_gene' for other in ds[1:]
    ]
    # Joined like the per-species files above, so the homology table lands in
    # the same directory whether or not `path` ends with a separator.
    name = os.path.join(path, 'homology_genes.tsv')
    with open(name, 'w') as f:
        s = server.datasets[ds[0] + '_eg_gene'].search(
            {
                'filters': {},
                'attributes': attr
            }, header=1)
        for line in s.iter_lines():
            f.write(line.decode('utf-8') + '\n')
    print('VB done.')
Пример #6
0
def _fetchFromServer(ensemble_server, attributes):
  """Fetch the given attributes of ``hsapiens_gene_ensembl`` as a DataFrame.

  :param ensemble_server: BioMart URL handed to BiomartServer.
  :param attributes: attribute names to request.
  :return: DataFrame parsed from the tab-separated response, which is
      requested with a header line.
  """
  human_genes = BiomartServer(ensemble_server).datasets['hsapiens_gene_ensembl']
  response = human_genes.search({'attributes': attributes}, header=1)
  tsv_text = response.content.decode()
  return pd.read_csv(io.StringIO(tsv_text), sep='\t')
Пример #7
0
 def QueryBioMart(dataset, attributes, host=host):
     """Query a BioMart dataset and return the result as a DataFrame.

     :param dataset: name of the dataset to query.
     :param attributes: attribute names to request; also used as the column
         names of the returned frame.
     :param host: BioMart URL handed to BiomartServer.
     :return: DataFrame with one row per non-empty response line.
     """
     server = BiomartServer(host)
     organism = server.datasets[dataset]
     response = organism.search({'attributes': attributes})
     # Use the decoded text: ``response.content`` is bytes and cannot be split
     # with a str separator on Python 3. Empty lines (such as the one after
     # the final newline) are dropped so they do not become a junk row.
     rows = [line.split("\t") for line in response.text.split("\n") if line]
     return pd.DataFrame(rows, columns=attributes)
Пример #8
0
def get_transcript(ensg, refseq):
    """Look up the Ensembl transcript and peptide IDs for a gene.

    Queries the ``hsapiens_gene_ensembl`` BioMart dataset, filtering by the
    Ensembl gene ID and the RefSeq mRNA accession.

    :param ensg: Ensembl gene ID (ENSG number).
    :param refseq: RefSeq mRNA accession used for the ``refseq_mrna`` filter.
    :return: tuple ``(transcript, protein)`` taken from the first result row.
    :raises IndexError: if BioMart returns no result row, or the first row
        has fewer than two whitespace-separated fields.
    """
    server = BiomartServer("http://www.ensembl.org/biomart")
    server.verbose = True
    hs_genes = server.datasets['hsapiens_gene_ensembl']
    results = hs_genes.search({
        'filters': {'ensembl_gene_id': '%s' % ensg,
                    'refseq_mrna': '%s' % refseq},
        'attributes': ['ensembl_transcript_id', 'ensembl_peptide_id']
    }, header=1)
    rows = [line.decode('utf-8').split() for line in results.iter_lines()]
    # The search was made with header=1, so rows[0] is the header line and the
    # first real hit (if any) is rows[1].
    if len(rows) < 2:
        raise IndexError(
            'BioMart returned no transcript for gene %s and RefSeq %s'
            % (ensg, refseq))
    first_hit = rows[1]
    if len(first_hit) < 2:
        raise IndexError(
            'BioMart row for gene %s lacks a transcript or peptide ID: %r'
            % (ensg, first_hit))
    return first_hit[0], first_hit[1]
Пример #9
0
def queryBM(query_attributes,
            query_dataset,
            query_filter=None,
            query_items=None,
            query_dic=None,
            host=biomart_host):
    """
    Queries BioMart.

    :param query_attributes: list of attributes to recover from BioMart
    :param query_dataset: dataset to query
    :param query_filter: one BioMart filter associated with the items being queried
    :param query_items: list of items to be queried (must associate with given filter)
    :param query_dic: for complex queries this option should be used instead of 'filters' and 'items' and a dictionary of filters provided here eg. querydic={"filter1":["item1","item2"],"filter2":["item3","item4"]}. If using querydic, don't query more than 350 items at once.
    :param host: address of the host server, default='http://www.ensembl.org/biomart'

    :returns: a Pandas dataframe of the queried attributes (empty, with the
        requested column names, when BioMart returns no rows)

    """
    server = BiomartServer(host)
    d = server.datasets[query_dataset]

    def _rows(params):
        # Run one search and split every decoded TSV line into its fields.
        response = d.search(params)
        return [
            line.decode('utf-8').split("\t")
            for line in response.iter_lines()
        ]

    res = []
    if query_dic:
        res.extend(_rows({
            'filters': query_dic,
            'attributes': query_attributes
        }))
    elif query_items:
        # Query in slices of at most 350 items per request. `range` is used
        # instead of `xrange`, which does not exist on Python 3.
        for start in range(0, len(query_items), 350):
            res.extend(_rows({
                'filters': {
                    query_filter: query_items[start:start + 350]
                },
                'attributes': query_attributes
            }))
    else:
        res.extend(_rows({'attributes': query_attributes}))

    # Passing the names to the constructor also works for an empty result,
    # where assigning `.columns` afterwards would raise a length mismatch.
    return pd.DataFrame(res, columns=query_attributes)
def main(newfile):
    """Write a gene-ID to peptide-ID table for human genes to ``newfile``.

    Requests ``ensembl_gene_id`` and ``ensembl_peptide_id`` from the
    ``hsapiens_gene_ensembl`` dataset and writes one tab-separated line per
    response row that has a non-empty peptide ID.

    :param newfile: path of the output file (overwritten).
    """
    atts = ['ensembl_gene_id', 'ensembl_peptide_id']
    url = 'http://www.ensembl.org/biomart'
    server = BiomartServer(url)
    hge = server.datasets['hsapiens_gene_ensembl']
    with open(newfile, 'w') as outF:
        s = hge.search({'attributes': atts}, header=0)
        for l in s.iter_lines():
            # iter_lines() yields bytes; decode before splitting with a str
            # separator (bytes.split('\t') is a TypeError on Python 3).
            if isinstance(l, bytes):
                l = l.decode('utf-8')
            (gene_id, peptide_id) = l.split('\t')
            if peptide_id:
                outF.write('%s\t%s\n' % (gene_id.strip(), peptide_id.strip()))
Пример #11
0
def get_region_from_rsid(rsid_list):
    """Translate dbSNP rsIDs into ``chr:start-stop`` region strings.

    The IDs are first looked up with the ``snp_filter`` filter of the
    ``hsapiens_snp`` dataset. IDs that were not found are looked up a second
    time with ``snp_synonym_filter``. Only rows whose chromosome name is in
    the module-level ``CONTIG`` collection are kept.

    :param rsid_list: rsIDs to look up; an empty value yields ``[]``.
    :return: list of ``"chr:start-stop"`` strings.
    """
    region_list = []

    if rsid_list:
        from biomart import BiomartServer
        server = BiomartServer("http://uswest.ensembl.org/biomart/")
        db = server.datasets['hsapiens_snp']
        attributes = ['refsnp_id', 'chr_name', 'chrom_start', 'chrom_end']
        response = db.search({'filters': {'variation_source': 'dbSNP',
                                          'snp_filter': rsid_list},
                              'attributes': attributes})

        output = dict()
        for line in response.iter_lines():
            line = line.decode('utf-8').split("\t")
            try:
                if line[0] in output:
                    print('Multiple entries for: {}'.format(line[0]))

                if line[1] in CONTIG:
                    output[line[0]] = {'chr': line[1],
                                       'start': line[2],
                                       'stop': line[3]}

            except IndexError:
                # Row with fewer than four fields: report it and move on.
                if line[0] not in output:
                    print('Error getting chr pos for: {}'.format(line[0]))

        present = set(output)
        if len(present) < len(rsid_list):
            missing = [x for x in rsid_list if x not in present]
            print("Missing following rsID: " + ', '.join(missing))

            response = db.search({'filters': {'variation_source': 'dbSNP',
                                              'snp_synonym_filter': missing},
                                  'attributes': attributes})

            print('Converted the following rsID:')
            for i, line in enumerate(response.iter_lines()):
                line = line.decode('utf-8').split("\t")
                try:
                    if line[1] in CONTIG:
                        output[line[0]] = {'chr': line[1],
                                           'start': line[2],
                                           'stop': line[3]}
                    # NOTE(review): pairs the i-th response row with the i-th
                    # missing ID; this assumes BioMart answers in request
                    # order with one row per ID -- confirm.
                    print(missing[i] + ' -> ' + line[0])
                except IndexError:
                    print("Error getting chr pos for: {}".format(line[0]))

        region_list = [
            str(entry["chr"]) + ":" + str(entry["start"]) + "-" + str(entry["stop"])
            for entry in output.values()
            if entry["chr"] and entry["start"] and entry["stop"]
        ]

    return region_list
Пример #12
0
def datasetsBM(host=biomart_host):
    """
    Lists BioMart datasets.

    The client's ``show_datasets()`` output is captured from stdout,
    stripped of braces, and printed with ``": "`` replaced by a tab.

    :param host: address of the host server, default='http://www.ensembl.org/biomart'

    :returns: nothing

    """
    stdout_ = sys.stdout  # Keep track of the previous value.
    stream = StringIO()
    sys.stdout = stream
    try:
        # Honour the `host` argument (the module default used to be
        # hard-wired here, so a custom host was silently ignored).
        server = BiomartServer(host)
        server.show_datasets()
    finally:
        # Always restore stdout, even when the server call raises.
        sys.stdout = stdout_
    variable = stream.getvalue()
    v = variable.replace("{", " ")
    v = v.replace("}", " ")
    v = v.replace(": ", "\t")
    print(v)
Пример #13
0
def searchDataset(chrom, start, end):
    """Return the first gene-annotation line for a region of ``ccrigri``.

    Searches the ``ccrigri_gene_ensembl`` dataset with the given chromosome,
    start and end filters.

    :return: the first response line decoded as UTF-8, or None when the
        response has no lines. Only the first line is ever returned.
    """
    from biomart import BiomartServer

    asia = BiomartServer("http://asia.ensembl.org/biomart")
    asia.verbose = True
    dataset = asia.datasets["ccrigri_gene_ensembl"]

    region = {"chromosome_name": chrom, "start": start, "end": end}
    wanted = [
        "ensembl_gene_id", "ensembl_transcript_id", "refseq_mrna",
        "external_gene_name", "start_position", "end_position",
        "wikigene_description"
    ]
    response = dataset.search({"filters": region, "attributes": wanted})

    for raw in response.iter_lines():
        first_line = raw.decode("utf-8")
        return first_line
    return None
 def __init__(self, **kwargs):
     """
     Set up the connection to the GRCh37 Biomart server.

     The server's verbosity follows ``self.VERBOSE``. When ``self.proxies``
     has an ``'http'`` entry it is used as the server's HTTP proxy. The
     ``ENSEMBL_MART_SNP`` database and its ``hsapiens_snp`` dataset are kept
     on the instance.
     """
     super().__init__(**kwargs)
     self.server = BiomartServer('http://grch37.ensembl.org/biomart')
     server = self.server
     server.verbose = self.VERBOSE
     use_http_proxy = self.proxies and 'http' in self.proxies
     if use_http_proxy:
         server.http_proxy = self.proxies['http']
     self.database = server.databases['ENSEMBL_MART_SNP']
     self.human_snps = self.database.datasets['hsapiens_snp']
Пример #15
0
    def __init__(self):
        """Open the GRCh37 BioMart and select the human SNP dataset.

        Stores the server, its ``ENSEMBL_MART_SNP`` database and that
        database's ``hsapiens_snp`` dataset on the instance.
        """
        # GRCh37 is also known as hg19.
        grch37 = BiomartServer("http://grch37.ensembl.org/biomart")
        self.server = grch37

        snp_mart = grch37.databases["ENSEMBL_MART_SNP"]
        self.database = snp_mart

        self.dataset = snp_mart.datasets["hsapiens_snp"]
Пример #16
0
def get_ENSEMBL_data(listID, dataset, transcr_expr_file=None):
    """Connect to ENSEMBL and retrieve transcript data for a list of genes.

    The function follows two modes of working:
    1) Transcript selection scheme according to ENSEMBL/HAVANA, TSL and APPRIS. (Default)
    2) Transcript selection by the best expressed transcript if there is an externally provided file.

    The gene IDs are queried in chunks of 100 and only rows whose
    ``Transcript type`` is ``protein_coding`` are kept. The actual selection
    is delegated to ``select_transcripts`` (defined elsewhere).

    :param listID: Ensembl gene IDs to query.
    :param dataset: name of the BioMart dataset to query.
    :param transcr_expr_file: passed through to ``select_transcripts``.
    :return: Pandas data frame of the transcripts and their ENSEMBL features.
    """
    print("Connection to ENSEMBL server.", file=sys.stderr)
    server = BiomartServer("http://www.ensembl.org/biomart/")
    dt = server.datasets[dataset]
    print("Retrieve the dataset...", file=sys.stderr)
    listAttrib = ['ensembl_gene_id', 'ensembl_transcript_id', 'external_gene_name', 'transcript_length', 'transcript_biotype', 'cdna_coding_start', 'cdna_coding_end', 'cdna', 'description']
    listAttrib2 = ['ensembl_gene_id', 'ensembl_transcript_id', 'transcript_tsl', 'transcript_appris', 'transcript_source', 'transcript_length', 'transcript_biotype']
    print("Fetch data from: {}".format(str(server)), file=sys.stderr)
    # Collect the per-chunk frames and concatenate once at the end; growing a
    # frame with pd.concat inside the loop copies all earlier rows each time.
    feat_frames = []
    trans_frames = []
    for chunk in chunks(listID, 100):
        print('Fetching...', file=sys.stderr)
        res1 = dt.search({'filters': {'ensembl_gene_id': chunk}, 'attributes': listAttrib}, header=1)
        res2 = dt.search({'filters': {'ensembl_gene_id': chunk}, 'attributes': listAttrib2}, header=1)
        # Reading stream to a pandas data frame.
        dataf = pd.read_csv(io.StringIO(res1.text), sep='\t', encoding='utf-8')
        datat = pd.read_csv(io.StringIO(res2.text), sep='\t', encoding='utf-8')
        # Drop lines that do not correspond to protein coding genes.
        feat_frames.append(dataf[dataf['Transcript type'] == 'protein_coding'])
        trans_frames.append(datat[datat['Transcript type'] == 'protein_coding'])
    # pd.concat rejects an empty list, so fall back to an empty frame when no
    # chunk was processed.
    dfFeat = pd.concat(feat_frames, axis=0, sort=False) if feat_frames else pd.DataFrame()
    dfTrans = pd.concat(trans_frames, axis=0, sort=False) if trans_frames else pd.DataFrame()
    print("...fetch done!", file=sys.stderr)
    # Function to select transcripts.
    transcripts = select_transcripts(dfTrans, dfFeat, transcr_expr_file)
    # Set the size of the UTRs and the CDS by using the information of the coding exon.
    for index, row in transcripts.iterrows():
        # For 5'UTR end take the smallest coordinate in the coding exons.
        coding_start = min(int(x) for x in row["cDNA coding start"].split(";"))
        coding_end = max(int(x) for x in row["cDNA coding end"].split(";"))
        # REPLACE the right CDS start and end
        transcripts.at[index, "cDNA coding start"] = coding_start
        transcripts.at[index, "cDNA coding end"] = coding_end
        # Clean up the TSL value: keep only its first whitespace-separated token.
        tsl = row["Transcript support level (TSL)"].split()[0]
        transcripts.at[index, "Transcript support level (TSL)"] = tsl
    return transcripts
Пример #17
0
def get_region_from_gene(gene_list):
    """Translate HGNC gene symbols into ``chr:start-stop`` region strings.

    Searches the ``hsapiens_gene_ensembl`` dataset (source
    ``ensembl_havana``) for the given symbols. Rows whose chromosome, start
    or end field is missing or not an integer are reported and skipped.

    :param gene_list: HGNC symbols to look up.
    :return: list of ``"chr:start-stop"`` strings.
    """
    from biomart import BiomartServer

    server = BiomartServer("http://uswest.ensembl.org/biomart/")
    db = server.datasets['hsapiens_gene_ensembl']
    response = db.search({
        'filters': {
            'source': 'ensembl_havana',
            'hgnc_symbol': gene_list
        },
        'attributes':
        ['hgnc_symbol', 'chromosome_name', 'start_position', 'end_position']
    })

    output = dict()
    for line in response.iter_lines():
        line = line.decode('utf-8').split("\t")
        try:
            if line[0] in output:
                print('Multiple entries for: {}'.format(line[0]))
                print(line)

            output[line[0]] = {
                'chr': int(line[1]),
                'start': int(line[2]),
                'stop': int(line[3])
            }
        except (IndexError, ValueError):
            # IndexError: the row has fewer than four fields.
            # ValueError: a field is not an integer (this includes chromosome
            # names that are not plain numbers).
            if line[0] not in output:
                print('Error getting chr pos for: {}'.format(line[0]))

    present = set(output)
    if len(present) < len(gene_list):
        missing = [x for x in gene_list if x not in present]
        print("Missing following genes: " + ', '.join(missing))

    region_list = [
        str(entry["chr"]) + ":" + str(entry["start"]) + "-" +
        str(entry["stop"]) for entry in output.values()
        if entry["chr"] and entry["start"] and entry["stop"]
    ]

    return region_list
Пример #18
0
def gene_name_to_ensembl_id():
    """Build a gene-name / Ensembl-gene-ID table from the GRCh37 BioMart.

    The GRCh37 server is used on purpose (the original note says this keeps
    everything in sync with build 37). The response is requested with a
    header line, which becomes the column labels of the returned frame.

    :return: DataFrame of the response rows with duplicates removed.
    """
    wanted = ['external_gene_name', 'ensembl_gene_id']
    grch37 = BiomartServer("http://grch37.ensembl.org/biomart")
    human_genes = grch37.datasets['hsapiens_gene_ensembl']

    # Collect the response as a list of field lists.
    reply = human_genes.search({'attributes': wanted}, header=1)
    rows = []
    for raw in reply.iter_lines():
        rows.append(raw.decode("utf-8").split("\t"))

    # First row holds the header: promote it to column labels, then drop it.
    table = pd.DataFrame(rows)
    table.columns = table.iloc[0]
    without_header = table.drop([0])
    return without_header.drop_duplicates()
Пример #19
0
def getEnsemblAnnotation(species="hsapiens", filePath=None):
    """Fetch gene symbol, gene ID and transcript ID for a species.

    :param species: dataset prefix; ``<species>_gene_ensembl`` is queried.
        ``"mmusculus"`` requests ``mgi_symbol``; any other value requests
        ``hgnc_symbol``.
    :param filePath: when given, the table is written there as CSV and
        nothing is returned.
    :return: the annotation DataFrame when ``filePath`` is None, else None.
    """
    from biomart import BiomartServer
    import pandas as pd

    server = BiomartServer("http://www.biomart.org/biomart")
    ensembl = server.databases["ensembl"]
    genes = ensembl.datasets["{0}_gene_ensembl".format(species)]

    # The symbol attribute was previously the literal "hsapiens" for
    # non-mouse species, which is a dataset prefix rather than an attribute.
    # NOTE(review): `hgnc_symbol` is assumed to be the intended attribute --
    # confirm against the dataset's attribute list.
    attributes = ['mgi_symbol'] if species == "mmusculus" else ['hgnc_symbol']
    attributes += ['ensembl_gene_id', 'ensembl_transcript_id']
    response = genes.search({'attributes': attributes})

    results = list()
    for line in response.iter_lines():
        # iter_lines() yields bytes; decode before splitting on a str tab.
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        results.append(line.split("\t"))

    annotation = pd.DataFrame(results, columns=attributes)
    if filePath is not None:
        annotation.to_csv(filePath, index=False)
    else:
        return annotation
Пример #20
0
def attributesBM(dataset, host=biomart_host):
    """
    Lists BioMart attributes for a specific dataset.

    The client's ``show_attributes()`` output is captured from stdout,
    stripped of braces, and printed with ``": "`` replaced by a tab.

    :param dataset: dataset to list attributes of.
    :param host: address of the host server, default='http://www.ensembl.org/biomart'

    :returns: nothing

    """
    stdout_ = sys.stdout  # Keep track of the previous value.
    stream = StringIO()
    sys.stdout = stream
    try:
        server = BiomartServer(host)
        d = server.datasets[dataset]
        d.show_attributes()
    finally:
        # Always restore stdout, even when the server call raises.
        sys.stdout = stdout_
    variable = stream.getvalue()
    v = variable.replace("{", " ")
    v = v.replace("}", " ")
    v = v.replace(": ", "\t")
    print(v)
Пример #21
0
def convert_entrez_to_gene_symbol(keyword_list):
    """
    Look up gene symbol and description for Entrez gene IDs using Biomart.

    The IDs are passed as the ``entrezgene`` filter of the
    ``hsapiens_gene_ensembl`` dataset on the March 2017 Ensembl archive.
    Returns the raw search response (no header line is requested).
    """
    archive = BiomartServer("http://mar2017.archive.ensembl.org/biomart")
    human_genes = archive.datasets['hsapiens_gene_ensembl']

    # Custom filter and attributes for the search.
    search_spec = {
        'filters': {'entrezgene': keyword_list},
        'attributes': ['external_gene_name', 'description', 'entrezgene'],
    }
    return human_genes.search(search_spec)
Пример #22
0
from biomart import BiomartServer

# Attributes requested for every gene of the human dataset.
atts = ['external_gene_name','external_gene_source','ensembl_gene_id',
        'ensembl_transcript_id','ensembl_peptide_id']

# Connect to the BioMart server and select the human gene dataset.
server = BiomartServer( "http://www.biomart.org/biomart" )
hge = server.datasets['hsapiens_gene_ensembl']

# Print the 'ensembl' database entry, then run the search with a header line
# and print every raw response line as received.
print(server.databases['ensembl'])
s = hge.search({'attributes': atts}, header=1)
for l in s.iter_lines():
    print(l)
Пример #23
0
# Code: access BioMart
#!/usr/bin/python
# Import BiomartServer from the biomart module.

from biomart import BiomartServer

# Connect to the BioMart server.

server = BiomartServer("http://www.biomart.org/biomart")

# Show all the server databases on the console.

server.show_databases()

# Show all the server datasets on the console.

server.show_datasets()

# Use the 'uniprot' dataset.

uniprot = server.datasets['uniprot']

# Next step described by the original author (the code for it is not part of
# this section): run a search with custom filters and attributes, such as
# searching for results with the protein name "Dystrophin"; the UniProt
# accession number, the protein name and the gene name are given for each
# search result.
Пример #24
0
#!/usr/bin/env python
# Client library: https://pypi.python.org/pypi/biomart
from biomart import BiomartServer

# Connect to a BioMart server.
server = BiomartServer( "http://www.biomart.org/biomart" )

# Set verbose to True to get some messages from the client.
server.verbose = True
Пример #25
0
from biomart import BiomartServer

# Connect to the BioMart server. If you are behind a proxy, see:
# https://pypi.python.org/pypi/biomart/0.8.0
server = BiomartServer( "http://www.biomart.org/biomart" )

server.verbose = True  # set verbose to True to get some messages

# Select the dataset that is searched below; it was used without ever being
# defined, which made the search fail with a NameError.
uniprot = server.datasets['uniprot']

# run a search with custom filters and attributes (no header)
response = uniprot.search({
    'filters': {
        'accession': ['Q9FMA1', 'Q8LFJ9']
        },
    'attributes': [
        'accession', 'protein_name'
    ]
})
Пример #26
0
# FUNCTIONS ====================================================================

# RUN ==========================================================================

# GENE LIST
# ------------------------------------------------------------------------------

# Read the gene list: first column of a header-less CSV file, as a Python list.
gene_list = pd.read_csv(gene_list_file, header=None)[0].tolist()

# BioMart
# Connect to the server and prepare for querying
# ------------------------------------------------------------------------------

# Connect to BioMart with the client's verbose messages switched off.
server = BiomartServer("http://www.ensembl.org/biomart")
server.verbose = False

# To list the available databases: server.show_databases()

# Select the genes database.
db = server.databases['ENSEMBL_MART_ENSEMBL']

# To list the available datasets (species): db.show_datasets()

# Select the H. sapiens dataset.
ds = db.datasets['hsapiens_gene_ensembl']

# NOTE(review): the original comment here announced listing the filters and
# attributes of a 'uniprot' dataset, but no such code follows in this section.
def biomart_anno(url, loca):
    """Download gene annotation from an Ensembl BioMart and save it as TSV.

    The species is taken from the first path component of ``url`` (expected
    to look like ``Genus_species``) and turned into a dataset prefix such as
    ``gspecies``. Hosts whose first label is one of the Ensembl Genomes
    divisions use the ``_eg_gene`` dataset suffix, all others
    ``_gene_ensembl``. GO, KEGG, Entrez and SwissProt cross-references
    obtained from the sibling helpers are added as comma-joined columns when
    they are non-empty.

    :param url: URL whose scheme/host locate the BioMart and whose path
        starts with the species name.
    :param loca: output directory (created if needed); the table is written
        to ``<loca>/gene.infor.tsv``.
    """
    urlp = urlparse(url)

    species = urlp.path.split('/')[1]
    name_parts = species.split('_')
    code = name_parts[0][0].lower() + name_parts[1]

    print("Connecting to Ensembl biomart...")
    server = BiomartServer('%s://%s/biomart' % (urlp.scheme, urlp.netloc))
    datasets = server.datasets

    divisions = ['metazoa', 'plants', 'fungi', 'bacteria', 'protists']
    suffix = ('_eg_gene'
              if urlp.netloc.split('.')[0] in divisions else '_gene_ensembl')
    ds = datasets[code + suffix]

    # Create the output directory without going through a shell, so paths
    # with spaces or shell metacharacters are handled safely.
    os.makedirs(loca, exist_ok=True)

    S = ds.search({'attributes': [
        'ensembl_gene_id', 'gene_biotype', 'external_gene_name',
        'description', 'chromosome_name', 'start_position', 'end_position',
        'strand'
    ]})

    gene_infor = pd.DataFrame.from_records(
        [i.decode('utf-8').split('\t') for i in S.iter_lines()],
        columns=[
            'gene', 'gene_biotype', 'gene_name', 'gene_description',
            'chromosome_name', 'start_position', 'end_position', 'strand'
        ])

    # Translate the numeric strand into +/-; anything else is kept as text.
    m = {"-1": "-", "1": "+"}
    gene_infor['strand'] = [
        m[i] if i in m else str(i) for i in gene_infor['strand']
    ]

    position_columns = [
        'chromosome_name', 'start_position', 'end_position', 'strand'
    ]
    gene_infor['gene_position'] = gene_infor.loc[:, position_columns].apply(
        lambda x: ':'.join(x), axis=1)
    gene_infor.drop(position_columns, axis=1, inplace=True)

    def _add_joined_column(table, column):
        # Collapse all values of `column` per gene into one ", "-joined
        # string and attach it to gene_infor; empty tables add no column.
        if table.shape[0] == 0:
            return
        joined = table.groupby('gene')[column].apply(lambda x: ', '.join(x))
        gene_infor[column] = [
            joined[i] if i in joined else '' for i in gene_infor['gene']
        ]

    _add_joined_column(getGO(ds, loca), 'GO_id')
    _add_joined_column(getKEGG(ds, loca), 'KEGG_enzyme')
    _add_joined_column(getEntrezgene(ds, loca), 'entrez')
    _add_joined_column(getSwissProt(ds, loca), 'SwissProt')

    gene_infor.to_csv(loca + '/gene.infor.tsv', sep='\t', index=False)

    print('saved %d records to %s/gene.infor.tsv' %
          (gene_infor.shape[0], loca))
Пример #28
0
import os
from glob import glob

from tqdm import tqdm
import pandas as pd
import click
from sklearn.linear_model import LogisticRegression
import numpy as np
from biomart import BiomartServer

import readquant

# Module-level BioMart handles: the connection is made once, when this module
# is imported, and the human gene dataset is shared by the functions below.
print('Connecting to biomart...')
SERVER = BiomartServer("http://www.ensembl.org/biomart")
ENSEMBL = SERVER.datasets['hsapiens_gene_ensembl']


def get_ERCC():
    """Return the Mix 1 concentration column of readquant's ERCC table."""
    ercc_table = readquant.data.ERCC()
    mix1_column = 'concentration in Mix 1 (attomoles/ul)'
    return ercc_table[mix1_column]


def get_MT():
    """Query BioMart for the Ensembl gene IDs located on chromosome ``MT``.

    NOTE(review): the visible body builds ``idx`` but does not return it;
    the function may continue beyond this section -- confirm.
    """
    print('Get MT genes from biomart...')
    r = ENSEMBL.search({
        'filters': {
            'chromosome_name': 'MT'
        },
        'attributes': ['ensembl_gene_id']
    })

    # One index entry per whitespace-separated token of the response text.
    idx = pd.Index(r.text.split())
Пример #29
0
# Code: access BioMart
#!/usr/bin/python
# Import BiomartServer from the biomart module.

from biomart import BiomartServer

# Connect to the BioMart server.

server = BiomartServer("http://www.biomart.org/biomart")

# Show all the server databases on the console.

server.show_databases()

# Show all the server datasets on the console.

server.show_datasets()

# Use the 'uniprot' dataset.

uniprot = server.datasets['uniprot']

# Next step described by the original author (the code for it is not part of
# this section): run a search with custom filters and attributes, such as
# searching for results with the protein name "Dystrophin"; the UniProt
# accession number, the protein name and the gene name are given for each
# search result.
Пример #30
0
'''

# Query Ensembl BioMart for protein-feature coordinates and write them as TSV.
# NOTE(review): the dollar-brace placeholders and the doubled backslashes
# suggest this text is expanded by an external templating layer before it
# runs as Python; confirm against the enclosing file.
from biomart import BiomartServer

# Ensembl release number mapped to the matching archive BioMart URL.
urls = { '92': 'http://apr2018.archive.ensembl.org/biomart', '91': 'http://dec2017.archive.ensembl.org/biomart',
         '90': 'http://aug2017.archive.ensembl.org/biomart', '89': 'http://may2017.archive.ensembl.org/biomart',
         '88': 'http://mar2017.archive.ensembl.org/biomart', '87': 'http://dec2016.archive.ensembl.org/biomart',
         '86': 'http://oct2016.archive.ensembl.org/biomart', '85': 'http://jul2016.archive.ensembl.org/biomart',
         '84': 'http://mar2016.archive.ensembl.org/biomart', '83': 'http://dec2015.archive.ensembl.org/biomart',
         '82': 'http://sep2015.archive.ensembl.org/biomart', '81': 'http://jul2015.archive.ensembl.org/biomart',
         '80': 'http://may2015.archive.ensembl.org/biomart', '79': 'http://mar2015.archive.ensembl.org/biomart',
         '78': 'http://dec2014.archive.ensembl.org/biomart', '77': 'http://oct2014.archive.ensembl.org/biomart',
         '76': 'http://aug2014.archive.ensembl.org/biomart', '75': 'http://feb2014.archive.ensembl.org/biomart',
         '74': 'http://dec2013.archive.ensembl.org/biomart' }

# Connect to the archive of the requested release.
server = BiomartServer( urls['${ENSEMBL_VERSION}'] )
# Feature database key mapped to the label written in the output.
tags = { 'pfam': 'Pfam', 'scanprosite': 'Prosite', 'prosite': 'Prosite' }

ensembl = server.datasets['hsapiens_gene_ensembl']

# Request transcript ID, version, and the feature with its start and end.
response = ensembl.search({
    'attributes': [ 'ensembl_transcript_id', 'transcript_version', '${DB}', '${DB}_start', '${DB}_end' ]
})

# Write one line per response row that has a non-empty feature field.
with open('${DB}_features.tsv', 'w') as OUT:
    for line in response.iter_lines():
        line = line.decode('utf-8')
        t,v,f,s,e = line.split("\\t")
        if f:
            OUT.write('{}.{}\\t{}\\t{}\\t{}\\t{}\\n'. format(t, v, tags['${DB}'], f, s, e))
Пример #31
0
 def __init__(self):
     """Create the BioMart client for www.ensembl.org and keep it private."""
     ensembl_biomart_url = "http://www.ensembl.org/biomart"
     self.__server = BiomartServer(ensembl_biomart_url)
Пример #32
0
from biomart import BiomartServer
import json

# Connect to the VectorBase BioMart.
server = BiomartServer('http://biomart.vectorbase.org/biomart')

# Dataset 'astephensi_eg_gene'.
steph = server.datasets['astephensi_eg_gene']

# Request only the gene IDs (no filters, no header line).
response = steph.search({'attributes': ['ensembl_gene_id']})

# Collect every decoded response line.
genes = []
for line in response.iter_lines():
    line = line.decode('utf-8')
    genes.append(line)

# Save the collected lines as a JSON array.
with open('./data/genes.json', 'w') as file:
    json.dump(genes, file)
Пример #33
0
from biomart import BiomartServer
from Clean import clean

# Module-level connection to the VectorBase BioMart, shared by the code below.
server = BiomartServer( "http://biomart.vectorbase.org/biomart/" )
# Notes from the original author:
# - 'vb_gene_mart_1708': VectorBase Genes database
# - 'agambiae_eg_gene': Anopheles gambiae genes (AgamP4) dataset
# - 'agambiae_eg_genomic_sequence': Anopheles gambiae sequences (AgamP4)
# - server.show_databases() lists the server databases
# - server.show_datasets() lists the server datasets
# - <dataset>.show_filters() and <dataset>.show_attributes() list what a
#   dataset offers (the original note used a 'uniprot' dataset as example)
# Attribute names and their display labels:
# - 'ensembl_gene_id': 'Gene stable ID', 'strand': 'Strand',
# - 'start_position': 'Gene start (bp)', 'end_position': 'Gene end (bp)',
# - 'chromosome_name': 'Chromosome/scaffold name'
def vb_import(ds):
    """Print the gene coordinates of a VectorBase dataset, one line per gene.

    Uses the module-level ``server`` connection. The response is requested
    with a header line and without filters.

    :param ds: name of the dataset to query (e.g. ``'agambiae_eg_gene'``).
    """
    r = server.datasets[ds].search({
        'filters': {},
        'attributes': ['chromosome_name', 'start_position', 'end_position',
                       'strand', 'ensembl_gene_id']
    }, header=1)
    # The function used to open 'test_response.txt' for reading without ever
    # using it, which made it fail whenever that file did not exist; the
    # unused handle has been removed.
    # response format is TSV
    for line in r.iter_lines():
        print(line.decode('utf-8'))
# Example call (disabled): vb_import('agambiae_eg_gene')
# Handle on the Anopheles gambiae gene dataset.
abc = server.datasets['agambiae_eg_gene']
# To list its attributes: abc.show_attributes()
Пример #34
0
import getopt, sys
from biomart import BiomartServer
from io import StringIO
from os import listdir
from os import walk
import glob
import pandas as pd

# Module-level connection to Ensembl BioMart and the human gene dataset,
# created once at import time and used by the functions below.
server = BiomartServer('http://www.ensembl.org/biomart')
hsap = server.datasets['hsapiens_gene_ensembl']


def strand_fix(i):
    """Translate a numeric strand into its symbol.

    :param i: strand value; ``1`` means forward, ``-1`` means reverse.
    :return: ``'+'`` for 1, ``'-'`` for -1, otherwise ``'invalid strand'``.
    """
    return '+' if i == 1 else ('-' if i == -1 else 'invalid strand')


def get_bed_data(extGeneNames):
    response = hsap.search(
        {
            'filters': {
                'external_gene_name': extGeneNames
            },
            'attributes': [
                "chromosome_name", "start_position", "end_position", "strand",
                "external_gene_name"
            ]