Example No. 1
    def __init__(
          self,
          host: str = 'http://www.ensembl.org',
          mart: str = 'ENSEMBL_MART_ENSEMBL',
          from_dataset: str = 'hsapiens_gene_ensembl',
          to_dataset: str = 'mmusculus_gene_ensembl',
          from_filters: str = 'hgnc_symbol',
          from_attributes: list = ['hgnc_symbol'],
          from_values: list = ['TP53', 'TERT'],
          to_attributes: list = ['external_gene_name'],
          to_homolog_attribute: str = 'mmusculus_homolog_ensembl_gene',
          from_gene_id_name: str = 'human_ensembl_gene_id',
          to_gene_id_name: str = 'mouse_ensembl_gene_id',
          chunk_size: int = 300
    ):

        # connect to the BioMart server and look up source/target datasets
        self.server = Server(host=host)
        self.ensembl_from = self.server.marts[mart].datasets[from_dataset]
        self.ensembl_to = self.server.marts[mart].datasets[to_dataset]

        # save parameters
        self.from_filters = from_filters
        self.from_values = from_values
        self.to_attributes = to_attributes
        self.to_homolog_attribute = to_homolog_attribute
        self.from_gene_id_name = from_gene_id_name
        self.to_gene_id_name = to_gene_id_name
        self.from_attributes = from_attributes
        self.chunk_size = chunk_size
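A minimal instantiation sketch; the enclosing class is not shown above, so the class name GeneMapper is a placeholder assumption:

# Hypothetical usage; ``GeneMapper`` stands in for the unseen class name.
mapper = GeneMapper(from_values=['TP53', 'TERT'], chunk_size=100)
mapper.ensembl_from  # hsapiens dataset handle, ready for .query() calls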
Example No. 2
from pybiomart import Server
import pandas as pd


def load_mapping_collection():
    """Build a mapping from protein stable IDs to transcript stable IDs
    using the human and mouse Ensembl BioMart datasets."""
    server = Server(host='http://www.ensembl.org')
    mart = server['ENSEMBL_MART_ENSEMBL']
    mouse_dataset = mart['mmusculus_gene_ensembl']
    human_dataset = mart['hsapiens_gene_ensembl']

    human_table = human_dataset.query(
        attributes=['ensembl_transcript_id', 'ensembl_peptide_id'])
    mouse_table = mouse_dataset.query(
        attributes=['ensembl_transcript_id', 'ensembl_peptide_id'])
    tables = pd.concat([human_table, mouse_table])
    tables.rename(columns={
        "Transcript stable ID": "transcript_id",
        "Protein stable ID": "protein_id"
    }, inplace=True)
    tables.dropna(how="any", inplace=True)

    mapping_collection = {}

    for index, row in tables.iterrows():
        prot_id = row["protein_id"]
        trans_id = row["transcript_id"]
        mapping_collection[prot_id] = trans_id

    return mapping_collection
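A short usage sketch for the function above; the protein ID is only an illustrative example:

# Build the protein -> transcript map once, then look up IDs from it.
mapping = load_mapping_collection()
transcript_id = mapping.get('ENSP00000269305')  # example protein stable ID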
Example No. 3
def mock_mart(mocker, server_marts_response):
    """Returns an example mart, built using a cached response."""

    mocker.patch.object(Server, 'get', return_value=server_marts_response)

    server = Server(host='http://www.ensembl.org')
    return server['ENSEMBL_MART_ENSEMBL']
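A sketch of how the fixture might be consumed in a test; the name assertion assumes pybiomart's Mart exposes its BioMart name, which is how the other examples on this page address it:

def test_mart_is_ensembl(mock_mart):
    # No network call happens here: the Server.get patch above serves the
    # cached response, and the fixture returns the resulting Mart object.
    assert mock_mart.name == 'ENSEMBL_MART_ENSEMBL'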
Example No. 4
from pybiomart import Server
import pybedtools


def biomart_bed(species):
    """Return two sorted BED objects built from BioMart exon data:
    one for exon ends ("take off") and one for exon starts ("landing")."""

    # Retrieve exon data from BioMart
    server = Server(host='http://www.ensembl.org')
    dataset = (server.marts['ENSEMBL_MART_ENSEMBL'].datasets[species +
                                                             '_gene_ensembl'])
    result = dataset.query(attributes=[
        'chromosome_name', 'exon_chrom_start', 'exon_chrom_end',
        'external_gene_name', 'ensembl_gene_id'
    ])
    result.columns = [
        'Chromosome', 'Exon_start', 'Exon_end', 'Gene_name', 'Ensembl_ID'
    ]

    # Create both BioMart DataFrames w/o duplicates (exon end and exon start);
    # each coordinate column is selected twice to give BED-style start/end fields
    biomart_take_off = result[[
        'Chromosome', 'Exon_end', 'Exon_end', 'Gene_name', 'Ensembl_ID'
    ]].drop_duplicates()
    biomart_landing = result[[
        'Chromosome', 'Exon_start', 'Exon_start', 'Gene_name', 'Ensembl_ID'
    ]].drop_duplicates()

    # Create sorted BioMart BED objects (sorting matters for BedTool operations)
    biomart_take_off_bed = pybedtools.BedTool.from_dataframe(
        biomart_take_off).sort()
    biomart_landing_bed = pybedtools.BedTool.from_dataframe(
        biomart_landing).sort()

    return biomart_take_off_bed, biomart_landing_bed
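A usage sketch; junctions.bed is a placeholder path for whatever intervals you want to compare against the exon boundaries:

# Build the two sorted BED objects for human, then find, for each junction,
# the closest exon end ("take off") using pybedtools.
take_off_bed, landing_bed = biomart_bed('hsapiens')
junctions = pybedtools.BedTool('junctions.bed').sort()
nearest_exon_ends = junctions.closest(take_off_bed)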
Example No. 5
    def _parse_gexp_data(self):
        """
        Parse per-sample gene expression files (*.FPKM.txt.gz) and map
        Ensembl gene identifiers to gene names.
        :return: pandas.DataFrame of expression values, one row per file
        """
        # extract gene expression data
        gene_exp = dict()

        for file_path in tqdm.tqdm(
                glob.glob(os.path.join(self.data_path, '*', '*.FPKM.txt.gz'))):
            f_name = os.path.basename(file_path)  # portable vs. splitting on '/'
            exp_dict = dict()
            with gzip.open(file_path, 'rt', newline='') as f:
                reader = csv.reader(f, delimiter='\t')
                for gene_id, expression in reader:
                    exp_dict[gene_id] = float(expression)
            gene_exp[f_name] = exp_dict

        print('Extract and sort Ensembl gene ids...')
        all_genes = [list(exp.keys()) for k, exp in gene_exp.items()]
        all_genes = [item for sublist in all_genes for item in sublist]
        all_genes = list(set(all_genes))
        all_genes.sort()

        # map Ensembl identifiers to gene names
        server = Server(host='http://www.ensembl.org')

        ensembl_dataset = (server.marts['ENSEMBL_MART_ENSEMBL'].
                           datasets['hsapiens_gene_ensembl'])

        results = ensembl_dataset.query(
            attributes=['ensembl_gene_id', 'external_gene_name'])

        ens_dict = dict(zip(results['Gene stable ID'], results['Gene name']))

        # keep only Ensembl genes that have a gene-name mapping
        all_genes = [
            ens_id for ens_id in all_genes if ens_id.split('.')[0] in ens_dict
        ]

        # create sorted gene expression dictionary
        sorted_gene_exp = dict()

        for f_name, exp_dict in tqdm.tqdm(gene_exp.items()):
            # genes absent from a sample get None so all rows share columns
            new_dict = OrderedDict({
                ens_dict[gene_id.split('.')[0]]: exp_dict.get(gene_id)
                for gene_id in all_genes
            })
            sorted_gene_exp[f_name] = new_dict

        converted_gene_exp = pd.DataFrame.from_dict(sorted_gene_exp,
                                                    orient='index')

        return converted_gene_exp
Example No. 6
def get_ensembl_dict():
    """
    Connect to the BioMart server and build an Ensembl ID mapping dict.
    :return: dict mapping 'Gene stable ID' to 'Gene name'
    """
    # map Ensembl identifiers to gene names
    server = Server(host='http://www.ensembl.org')

    ensembl_dataset = (
        server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl'])

    results = ensembl_dataset.query(
        attributes=['ensembl_gene_id', 'external_gene_name'])

    ens_dict = dict(zip(results['Gene stable ID'], results['Gene name']))

    return ens_dict
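Usage sketch; the gene ID shown is only an example of a versionless Ensembl stable ID:

ens_dict = get_ensembl_dict()
gene_name = ens_dict.get('ENSG00000141510')  # resolves to a gene symbol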
Example No. 7
    def fetch_data(self):
        frames = []

        server = Server(host="http://www.ensembl.org")

        datasets = self.species_to_dataset["dataset"].tolist()
        for dataset_name in datasets:
            dataset = server.marts["ENSEMBL_MART_ENSEMBL"].datasets[
                dataset_name]
            frames.append(
                dataset.query(attributes=[
                    "ensembl_gene_id",
                    "entrezgene_id",
                    "description",
                    "external_gene_name",
                ]))

        # concatenate once instead of growing the DataFrame inside the loop
        self.data = pd.concat(frames, axis=0)
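The method reads self.species_to_dataset["dataset"], so the enclosing (unseen) class presumably holds a DataFrame shaped like the hypothetical one below:

import pandas as pd

# Hypothetical shape of the attribute the method expects.
species_to_dataset = pd.DataFrame(
    {'dataset': ['hsapiens_gene_ensembl', 'mmusculus_gene_ensembl']})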
Example No. 8
def simple_query(
    org: str,
    attrs: Union[Iterable[str], str],
    *,
    filters: Optional[Dict[str, Any]] = None,
    host: str = "www.ensembl.org",
    use_cache: bool = False,
) -> pd.DataFrame:
    """\
    A simple interface to biomart.

    Params
    ------
    {doc_org}
    attrs
        What you want returned.
    filters
        What you want to pick out.
    {doc_host}
    {doc_use_cache}
    """
    if isinstance(attrs, str):
        attrs = [attrs]
    elif isinstance(attrs, cabc.Iterable):
        attrs = list(attrs)
    else:
        raise TypeError(
            f"attrs must be of type list or str, was {type(attrs)}."
        )
    try:
        from pybiomart import Server
    except ImportError:
        raise ImportError(
            "This method requires the `pybiomart` module to be installed."
        )
    server = Server(host, use_cache=use_cache)
    dataset = server.marts["ENSEMBL_MART_ENSEMBL"].datasets[
        "{}_gene_ensembl".format(org)
    ]
    res = dataset.query(attributes=attrs, filters=filters, use_attr_names=True)
    return res
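A call sketch using attribute and filter names that appear elsewhere on this page:

# Fetch gene IDs and names for human chromosomes 1 and 2.
df = simple_query(
    org='hsapiens',
    attrs=['ensembl_gene_id', 'external_gene_name'],
    filters={'chromosome_name': ['1', '2']},
)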
Example No. 9
def get_biomart(species, meta):
    tmp_host = 'http://asia.ensembl.org'
    server = Server(host=tmp_host)  # only needed to list datasets on failure
    query_set = None
    try:
        dataset = Dataset(name=species, host=tmp_host)
        if meta:
            query_set = dataset.query(attributes=[
                'ensembl_gene_id', 'external_gene_name', 'description',
                'uniprotswissprot', 'kegg_enzyme', 'metacyc'
            ])
        else:
            query_set = dataset.query(attributes=[
                'ensembl_gene_id', 'external_gene_name', 'description',
                'uniprotswissprot', 'kegg_enzyme'
            ])
    except IndexError:
        # an invalid dataset name surfaces as an IndexError; list valid ones
        mart = server['ENSEMBL_MART_ENSEMBL']
        print('Invalid dataset in BioMart')
        print(mart.list_datasets())
    return query_set
Example No. 10
def get_human_genes(db_info):
    """Connect to the Ensembl database and get the human gene dataset.
        Keep only required fields.

    Args:
        db_info: RuntimeConfig object with database info
    """
    # the configured reference genome path is used as the BioMart host URL
    server = Server(host=db_info.ref_genome_path)
    dataset = (
        server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl'])
    # Create list of human chromosomes.
    # Use this to filter out gene patches
    chromosome_filters = [str(x) for x in range(1, 23)]
    chromosome_filters.extend(['X', 'Y'])

    genes = dataset.query(
        attributes=[
            'hgnc_symbol', 'chromosome_name', 'start_position', 'end_position'
        ],
        filters={'chromosome_name': chromosome_filters})
    return genes
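A usage sketch; RuntimeConfig is not shown above, so a minimal stand-in object supplies the one attribute the function reads:

# Hypothetical stand-in for RuntimeConfig; the function uses
# ``ref_genome_path`` as the BioMart host URL.
class _FakeConfig:
    ref_genome_path = 'http://grch37.ensembl.org'

genes = get_human_genes(_FakeConfig())
print(genes.head())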
Example No. 11
domain_sources = [
    'TIGRFAM_domain', 'SMART_domains', 'PROSITE_profiles', 'Prints_domain',
    'Pfam_domain'
]
ps1data = collections.namedtuple(
    'ps1data', 'Index ps1 ps1_caution pm5 pm5_caution domains')
ppdata = collections.namedtuple('ppdata',
                                'Index pp2 pp2_caution pp3 pp3_caution')

ppsift_min = 0.7
cadd_min = 15
consens_min = 2
af = 0.001

server = Server(host='grch37.ensembl.org', use_cache=True)
dataset_snp = (server.marts['ENSEMBL_MART_SNP'].datasets['hsapiens_snp'])
dataset_ens = (
    server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl'])

gene_data = []

##########
# functions
##########


def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by the db_file
    :param db_file: database file
Example No. 12
    def __init__(self, filters):
        self.filters = filters
        server_grch37 = Server(host=self.BIOMART_SERVER_URL_GRCH37)
        self.dataset_grch37 = server_grch37.marts[self.MART].datasets[self.DATASET]
        server_grch38 = Server(host=self.BIOMART_SERVER_URL_GRCH38)
        self.dataset_grch38 = server_grch38.marts[self.MART].datasets[self.DATASET]
Example No. 13
from pybiomart import Server
server = Server(host='http://www.ensembl.org')
server.list_marts()
mart = server['ENSEMBL_MART_ENSEMBL']

from pybiomart import Dataset
dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')

dataset.query(attributes=['ensembl_gene_id', 'external_gene_name'],
              filters={'chromosome_name': ['1', '2']})


# Excerpt from pybiomart's Dataset class: attributes are fetched lazily,
# together with filters, on first access.
@property
def attributes(self):
    if self._attributes is None:
        self._filters, self._attributes = self._fetch_configuration()
    return self._attributes


dataset.attributes
dataset.list_attributes()


# Companion property from the same class: the same lazy fetch fills both.
@property
def filters(self):
    if self._filters is None:
        self._filters, self._attributes = self._fetch_configuration()
    return self._filters


dataset.filters
dataset.list_filters()
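The caching idea behind those two properties, isolated into a minimal self-contained sketch (names here are illustrative, not pybiomart's internals):

class LazyConfig:
    """Fetch filters and attributes once, on first access to either."""

    def __init__(self):
        self._filters = None
        self._attributes = None

    def _fetch_configuration(self):
        # Stand-in for the real request that parses the dataset's XML config.
        return {'chromosome_name': ...}, {'ensembl_gene_id': ...}

    @property
    def attributes(self):
        if self._attributes is None:
            self._filters, self._attributes = self._fetch_configuration()
        return self._attributes

    @property
    def filters(self):
        if self._filters is None:
            self._filters, self._attributes = self._fetch_configuration()
        return self._filters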
Example No. 14
from pybiomart import Dataset, Server
import pandas as pd
import logging

input_file = "D:/Users/Polina/3Dpredictor/input/K562/RNA-seq/rna-seqPolyA.tsv"
inp_data = pd.read_csv(input_file, delimiter="\t")
gene_id_field = 'gene_id'
inp_data["Gene_ID"] = inp_data[gene_id_field].apply(lambda x: x.split(".")[0])

server = Server(host='http://grch37.ensembl.org')
mart = server['ENSEMBL_MART_ENSEMBL']
dataset = mart["hsapiens_gene_ensembl"]

query = dataset.query(attributes=[
    'ensembl_gene_id', 'start_position', 'end_position', 'external_gene_name',
    'chromosome_name'
])

FinalData = pd.merge(left=inp_data,
                     right=query,
                     how="inner",
                     left_on="Gene_ID",
                     right_on="Gene stable ID",
                     validate="1:1")

if len(FinalData) != len(inp_data):
    logging.getLogger(__name__).warning(
        f"Some data missing in Ensembl: {len(inp_data) - len(FinalData)} "
        f"out of {len(inp_data)}")

FinalData.to_csv(input_file + "pre.txt", sep="\t", index=False)
Example No. 15
def get_bin_gene_region_df(bin_size, chr_stops, region_stops, excluded_bins):
    """
        Creates a pandas.DataFrame with the gene and region corresponding to each bin
        :param bin_size: number of base pairs in a bin
        :param chr_stops: dictionary indicating the final unfiltered bin of each chromosome
        :param region_stops: df indicating the final filtered bin of each region
        :param excluded_bins: list of excluded bins
        :return: DataFrame of (gene, chr, region, filtered_bin)
    """

    # Pull genome annotations using pybiomart
    server = Server("www.ensembl.org", use_cache=False)
    dataset = server.marts["ENSEMBL_MART_ENSEMBL"].datasets[
        "hsapiens_gene_ensembl"]
    gene_coordinates = dataset.query(attributes=[
        "chromosome_name", "start_position", "end_position",
        "external_gene_name"
    ],
                                     filters={
                                         'chromosome_name': [
                                             1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                             12, 13, 14, 15, 16, 17, 18, 19,
                                             20, 21, 22, 'X', 'Y'
                                         ]
                                     },
                                     use_attr_names=True)

    bin_gene_region_df = pd.DataFrame(index=range(chr_stops['Y'] + 1))
    chr_stops_df = pd.DataFrame({
        'chr': list(chr_stops.keys()),
        'stop': list(chr_stops.values())
    })

    bin_gene_region_df["region"] = None
    bin_gene_region_df["gene"] = [
        list() for _ in range(bin_gene_region_df.shape[0])
    ]
    bin_gene_region_df["chr"] = [
        list() for _ in range(bin_gene_region_df.shape[0])
    ]

    # for each gene
    for index, row in gene_coordinates.iterrows():
        start_bin = int(row["start_position"] / bin_size)
        stop_bin = int(row["end_position"] / bin_size)
        chromosome = str(row["chromosome_name"])

        if chromosome != "1":  # coordinates are given by chromosome
            chr_start = (chr_stops_df.iloc[np.where(
                chr_stops_df["chr"] == chromosome)[0][0] - 1].stop + 1)
            start_bin = start_bin + chr_start
            stop_bin = stop_bin + chr_start

        gene = row["external_gene_name"]
        for bin_idx in range(start_bin, stop_bin + 1):  # avoid shadowing builtin bin
            bin_gene_region_df.loc[bin_idx, "gene"].append(gene)
            bin_gene_region_df.loc[bin_idx, "chr"].append(chromosome)

    # Turn columns of lists into columns of strings with comma-separated values
    bin_gene_region_df["gene"] = [
        ",".join(map(str, l)) for l in bin_gene_region_df["gene"]
    ]
    bin_gene_region_df["chr"] = [
        ",".join(map(str, l)) for l in bin_gene_region_df["chr"]
    ]

    # Indicate original_bin-filtered_bin correspondence
    bin_is_excluded = np.zeros(list(chr_stops.values())[-1] + 1)
    bin_is_excluded[excluded_bins] = 1
    bin_is_excluded = bin_is_excluded.astype(bool)
    bin_gene_region_df["filtered_bin"] = None
    bin_gene_region_df["filtered_bin"].iloc[np.where(
        np.array(bin_is_excluded) == False)[0]] = np.arange(
            np.count_nonzero(np.array(bin_is_excluded) == False))

    # Get the regions
    region_stops = list(region_stops)
    last_bin = len(kept_bins) - 1
    region_stops.append(last_bin)
    start_bin = 0
    for i, stop_bin in enumerate(region_stops):
        original_start_bin = np.where(
            bin_gene_region_df["filtered_bin"] == start_bin)[0][0]
        original_stop_bin = np.where(
            bin_gene_region_df["filtered_bin"] == stop_bin)[0][0]
        bin_gene_region_df.loc[original_start_bin:original_stop_bin + 1,
                               "region"] = i  # regions are 0 indexed
        start_bin = stop_bin

    return bin_gene_region_df
Example No. 16
#!/data/shared_env/bin/python3
from pybiomart import Server
import pandas as pd
import sys

snp_list_file = sys.argv[1]
output_file_name = sys.argv[2]

snp_list = list(set(line.strip() for line in open(snp_list_file)))

server = Server(host='http://www.ensembl.org')
snpMart = server.marts['ENSEMBL_MART_SNP'].datasets['hsapiens_snp']


def query_ensembl(snp_list):
    query_result = snpMart.query(attributes=[
        'refsnp_id', 'associated_variant_risk_allele', 'source_name',
        'clinical_significance', 'phenotype_description', 'pmid'
    ],
                                 filters={'snp_filter': snp_list})
    return query_result


output_data = pd.DataFrame(columns=[
    'Variant name', 'Associated variant risk allele', 'Source name',
    'Clinical significance', 'Phenotype description', 'PubMed ID'
])
for snp in snp_list:
    print(snp)
    query_result = query_ensembl([snp])
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead
    output_data = pd.concat([output_data, query_result], ignore_index=True)
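Since the snp_filter filter accepts a list (the function already passes one through), the per-SNP loop can usually be collapsed into a single batched request; a minimal sketch, assuming the server tolerates the full list in one query:

# Batched alternative: one round-trip instead of one per rsID; very long
# lists may still need to be split into chunks.
output_data = query_ensembl(snp_list)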
Example No. 17
import os
import shutil
import subprocess
import sys
import warnings
from os import path

import pandas as pd
from pybiomart import Server

warnings.filterwarnings('ignore')

# Put the environment's bin directory on PATH so bedtools can be resolved
bin_dir = path.dirname(sys.executable)
os.environ['PATH'] += os.pathsep + bin_dir
shutil.which('bedtools')  # sanity check that bedtools is now discoverable
subprocess.run(['bedtools', '--help'])

# Read input file
CIViC_variants = pd.read_csv(sys.argv[1])

# Connect to the Ensembl BioMart server
server = Server(host='grch37.ensembl.org')
dataset = (
    server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl'])

# Create a table linking ENSG IDs to HUGO gene names
matching_list = dataset.query(
    attributes=['ensembl_gene_id', 'external_gene_name']).drop_duplicates()


# Create a function that builds probes for variants of different lengths
def create_probe_list(CIViC_variants):
    sparse_tile_variant_types = ['LOSS', 'AMPLIFICATION', 'DELETION']
    civic_coordinate_variants = [
        'DNA BINDING DOMAIN MUTATION', 'PROMOTER DEMETHYLATION',
        'CONSERVED DOMAIN MUT'
    ]