def __init__(self,
             host: str = 'http://www.ensembl.org',
             mart: str = 'ENSEMBL_MART_ENSEMBL',
             from_dataset: str = 'hsapiens_gene_ensembl',
             to_dataset: str = 'mmusculus_gene_ensembl',
             from_filters: str = 'hgnc_symbol',
             from_attributes: list = ['hgnc_symbol'],
             from_values: list = ['TP53', 'TERT'],
             to_attributes: list = ['external_gene_name'],
             to_homolog_attribute: str = 'mmusculus_homolog_ensembl_gene',
             from_gene_id_name: str = 'human_ensembl_gene_id',
             to_gene_id_name: str = 'mouse_ensembl_gene_id',
             chunk_size: int = 300):
    # connect to server
    self.server = Server(host=host)
    self.ensembl_from = self.server.marts[mart].datasets[from_dataset]
    self.ensembl_to = self.server.marts[mart].datasets[to_dataset]

    # save parameters
    self.from_filters = from_filters
    self.from_values = from_values
    self.to_attributes = to_attributes
    self.to_homolog_attribute = to_homolog_attribute
    self.from_gene_id_name = from_gene_id_name
    self.to_gene_id_name = to_gene_id_name
    self.from_attributes = from_attributes
    self.chunk_size = chunk_size
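# A minimal sketch of what the constructor above sets up, using pybiomart
# directly; the homolog query mirrors the documented defaults
# (from_filters, from_values, to_homolog_attribute), not the full class.
from pybiomart import Server

server = Server(host='http://www.ensembl.org')
ensembl_from = server.marts['ENSEMBL_MART_ENSEMBL'].datasets[
    'hsapiens_gene_ensembl']

# fetch the mouse homolog Ensembl IDs for the default gene symbols
df = ensembl_from.query(
    attributes=['ensembl_gene_id', 'mmusculus_homolog_ensembl_gene'],
    filters={'hgnc_symbol': ['TP53', 'TERT']})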
import pandas as pd
from pybiomart import Server


def load_mapping_collection():
    server = Server(host='http://www.ensembl.org')
    mart = server['ENSEMBL_MART_ENSEMBL']
    mouse_dataset = mart['mmusculus_gene_ensembl']
    human_dataset = mart['hsapiens_gene_ensembl']
    human_table = human_dataset.query(
        attributes=['ensembl_transcript_id', 'ensembl_peptide_id'])
    mouse_table = mouse_dataset.query(
        attributes=['ensembl_transcript_id', 'ensembl_peptide_id'])
    tables = pd.concat([human_table, mouse_table])
    tables.rename(columns={
        "Transcript stable ID": "transcript_id",
        "Protein stable ID": "protein_id"
    }, inplace=True)
    tables.dropna(how="any", inplace=True)
    # build a protein-ID -> transcript-ID lookup
    mapping_collection = {}
    for _, row in tables.iterrows():
        mapping_collection[row["protein_id"]] = row["transcript_id"]
    return mapping_collection
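# Usage sketch: look up the transcript ID for a protein ID (the example
# ENSP identifier is illustrative).
mapping = load_mapping_collection()
print(mapping.get('ENSP00000269305'))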
from pybiomart import Server


def mock_mart(mocker, server_marts_response):
    """Returns an example mart, built using a cached response."""
    mocker.patch.object(Server, 'get', return_value=server_marts_response)
    server = Server(host='http://www.ensembl.org')
    return server['ENSEMBL_MART_ENSEMBL']
import pybedtools
from pybiomart import Server


def biomart_bed(species):
    """Return sorted BED objects built from BioMart data, one for the exon
    end positions ("take off") and one for the exon start positions
    ("landing")."""
    # Retrieve data from BioMart
    server = Server(host='http://www.ensembl.org')
    dataset = (server.marts['ENSEMBL_MART_ENSEMBL']
               .datasets[species + '_gene_ensembl'])
    result = dataset.query(attributes=[
        'chromosome_name', 'exon_chrom_start', 'exon_chrom_end',
        'external_gene_name', 'ensembl_gene_id'
    ])
    result.columns = [
        'Chromosome', 'Exon_start', 'Exon_end', 'Gene_name', 'Ensembl_ID'
    ]
    # Build both BioMart data frames without duplicates; repeating the same
    # coordinate column yields single-position intervals (start == end)
    biomart_take_off = result[[
        'Chromosome', 'Exon_end', 'Exon_end', 'Gene_name', 'Ensembl_ID'
    ]].drop_duplicates()
    biomart_landing = result[[
        'Chromosome', 'Exon_start', 'Exon_start', 'Gene_name', 'Ensembl_ID'
    ]].drop_duplicates()
    # Create sorted BED objects (sorting is very important)
    biomart_take_off_bed = pybedtools.BedTool.from_dataframe(
        biomart_take_off).sort()
    biomart_landing_bed = pybedtools.BedTool.from_dataframe(
        biomart_landing).sort()
    return biomart_take_off_bed, biomart_landing_bed
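# Usage sketch: build the exon-boundary BEDs for human and find, for each
# interval of a query BED, the closest exon start; the 'reads.bed' file
# name is an assumption for illustration.
take_off_bed, landing_bed = biomart_bed('hsapiens')
reads = pybedtools.BedTool('reads.bed').sort()
nearest_landing = reads.closest(landing_bed)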
def _parse_gexp_data(self):
    """
    Parse gene expression data
    :return: gene expression DataFrame (one row per sample file)
    """
    # extract gene expression data
    gene_exp = dict()
    for file_path in tqdm.tqdm(
            glob.glob(os.path.join(self.data_path, '*', '*.FPKM.txt.gz'))):
        f_name = file_path.split('/')[-1]
        exp_dict = dict()
        with gzip.open(file_path, 'rt', newline='') as f:
            reader = csv.reader(f, delimiter='\t')
            for gene_id, expression in reader:
                exp_dict[gene_id] = float(expression)
        gene_exp[f_name] = exp_dict

    print('Extract and sort Ensembl gene ids...')
    all_genes = [list(exp.keys()) for k, exp in gene_exp.items()]
    all_genes = [item for sublist in all_genes for item in sublist]
    all_genes = sorted(set(all_genes))

    # map Ensembl identifiers to gene names
    server = Server(host='http://www.ensembl.org')
    ensembl_dataset = (server.marts['ENSEMBL_MART_ENSEMBL']
                       .datasets['hsapiens_gene_ensembl'])
    results = ensembl_dataset.query(
        attributes=['ensembl_gene_id', 'external_gene_name'])
    ens_dict = dict(zip(results['Gene stable ID'], results['Gene name']))

    # keep only Ensembl genes that map to a gene name
    all_genes = [
        ens_id for ens_id in all_genes if ens_id.split('.')[0] in ens_dict
    ]

    # create sorted gene expression dictionary
    sorted_gene_exp = dict()
    for f_name, exp_dict in tqdm.tqdm(gene_exp.items()):
        new_dict = OrderedDict({
            ens_dict[gene_id.split('.')[0]]: exp_dict.get(gene_id)
            for gene_id in all_genes
        })
        sorted_gene_exp[f_name] = new_dict

    converted_gene_exp = pd.DataFrame.from_dict(sorted_gene_exp,
                                                orient='index')
    return converted_gene_exp
from pybiomart import Server


def get_ensembl_dict():
    """
    Create a BioMart server connection and build the Ensembl ID mapping dict
    :return: dict mapping 'Gene stable ID' to 'Gene name'
    """
    # map Ensembl identifiers to gene names
    server = Server(host='http://www.ensembl.org')
    ensembl_dataset = (
        server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl'])
    results = ensembl_dataset.query(
        attributes=['ensembl_gene_id', 'external_gene_name'])
    ens_dict = dict(zip(results['Gene stable ID'], results['Gene name']))
    return ens_dict
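# A small usage sketch for the helper above: translate Ensembl gene IDs to
# gene symbols (the example IDs are illustrative).
ens_dict = get_ensembl_dict()
for ens_id in ['ENSG00000141510', 'ENSG00000012048']:
    print(ens_id, '->', ens_dict.get(ens_id, 'unknown'))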
def fetch_data(self):
    data = pd.DataFrame()
    server = Server(host="http://www.ensembl.org")
    datasets = self.species_to_dataset["dataset"].tolist()
    for dataset_name in datasets:
        dataset = server.marts["ENSEMBL_MART_ENSEMBL"].datasets[dataset_name]
        dataset_df = dataset.query(attributes=[
            "ensembl_gene_id",
            "entrezgene_id",
            "description",
            "external_gene_name",
        ])
        data = pd.concat([data, dataset_df], axis=0)
    self.data = data
import collections.abc as cabc
from typing import Any, Dict, Iterable, Optional, Union

import pandas as pd


def simple_query(
    org: str,
    attrs: Union[Iterable[str], str],
    *,
    filters: Optional[Dict[str, Any]] = None,
    host: str = "www.ensembl.org",
    use_cache: bool = False,
) -> pd.DataFrame:
    """\
    A simple interface to biomart.

    Params
    ------
    {doc_org}
    attrs
        What you want returned.
    filters
        What you want to pick out.
    {doc_host}
    {doc_use_cache}
    """
    if isinstance(attrs, str):
        attrs = [attrs]
    elif isinstance(attrs, cabc.Iterable):
        attrs = list(attrs)
    else:
        raise TypeError(f"attrs must be of type list or str, was {type(attrs)}.")
    try:
        from pybiomart import Server
    except ImportError:
        raise ImportError(
            "This method requires the `pybiomart` module to be installed."
        )
    server = Server(host, use_cache=use_cache)
    dataset = server.marts["ENSEMBL_MART_ENSEMBL"].datasets[
        "{}_gene_ensembl".format(org)
    ]
    res = dataset.query(attributes=attrs, filters=filters, use_attr_names=True)
    return res
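# A usage sketch for simple_query: fetch gene IDs and symbols for human
# chromosome 1 (the chromosome_name filter is used the same way elsewhere
# in this collection; columns come back as attribute names because of
# use_attr_names=True).
df = simple_query(
    "hsapiens",
    ["ensembl_gene_id", "external_gene_name"],
    filters={"chromosome_name": ["1"]},
)
print(df.head())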
from pybiomart import Dataset, Server


def get_biomart(species, meta):
    tmp_host = 'http://asia.ensembl.org'
    server = Server(host=tmp_host)
    query_set = None
    try:
        dataset = Dataset(name=species, host=tmp_host)
        if meta:
            query_set = dataset.query(attributes=[
                'ensembl_gene_id', 'external_gene_name', 'description',
                'uniprotswissprot', 'kegg_enzyme', 'metacyc'
            ])
        else:
            query_set = dataset.query(attributes=[
                'ensembl_gene_id', 'external_gene_name', 'description',
                'uniprotswissprot', 'kegg_enzyme'
            ])
    except IndexError:
        mart = server['ENSEMBL_MART_ENSEMBL']
        print('Invalid dataset in BioMart')
        print(mart.list_datasets())
    return query_set
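# Usage sketch for get_biomart (the dataset name follows the
# '<species>_gene_ensembl' convention used throughout this collection):
annotations = get_biomart('hsapiens_gene_ensembl', meta=False)
if annotations is not None:
    print(annotations.head())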
from pybiomart import Server


def get_human_genes(db_info):
    """Connect to the Ensembl database and get the human gene dataset.

    Keep only required fields.

    Args:
        db_info: RuntimeConfig object with database info
    """
    reference = db_info.ref_genome_path
    server = Server(host=reference)
    dataset = (
        server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl'])

    # Create a list of human chromosomes.
    # Use this to filter out gene patches.
    chromosome_filters = [str(x) for x in range(1, 23)]
    chromosome_filters.extend(['X', 'Y'])

    genes = dataset.query(attributes=[
        'hgnc_symbol', 'chromosome_name', 'start_position', 'end_position'
    ],
                          filters={'chromosome_name': chromosome_filters})
    return genes
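# Usage sketch with a minimal stand-in for the RuntimeConfig object; the
# real class is not shown, so SimpleNamespace and the GRCh37 host value are
# illustrative assumptions.
from types import SimpleNamespace

db_info = SimpleNamespace(ref_genome_path='http://grch37.ensembl.org')
genes = get_human_genes(db_info)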
import collections

from pybiomart import Server

domain_sources = [
    'TIGRFAM_domain', 'SMART_domains', 'PROSITE_profiles', 'Prints_domain',
    'Pfam_domain'
]
ps1data = collections.namedtuple(
    'ps1data', 'Index ps1 ps1_caution pm5 pm5_caution domains')
ppdata = collections.namedtuple('ppdata',
                                'Index pp2 pp2_caution pp3 pp3_caution')
ppsift_min = 0.7
cadd_min = 15
consens_min = 2
af = 0.001

server = Server(host='grch37.ensembl.org', use_cache=True)
dataset_snp = server.marts['ENSEMBL_MART_SNP'].datasets['hsapiens_snp']
dataset_ens = (
    server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl'])
gene_data = []


##########
# functions
##########


def create_connection(db_file):
    """
    create a database connection to the SQLite database
    specified by the db_file

    :param db_file: database file
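# Exploration sketch: the domain_sources above name protein-domain
# annotation sources; matching attributes in the SNP mart can be located by
# listing its attributes (list_attributes() returns a DataFrame with a
# 'name' column, as shown elsewhere in this collection).
snp_attrs = dataset_snp.list_attributes()
print(snp_attrs[snp_attrs['name'].str.contains('domain', case=False)])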
def __init__(self, filters):
    self.filters = filters
    server_grch37 = Server(host=self.BIOMART_SERVER_URL_GRCH37)
    self.dataset_grch37 = server_grch37.marts[self.MART].datasets[self.DATASET]
    server_grch38 = Server(host=self.BIOMART_SERVER_URL_GRCH38)
    self.dataset_grch38 = server_grch38.marts[self.MART].datasets[self.DATASET]
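# The class attributes referenced above are not shown in the snippet; a
# plausible set, assuming the hosts and dataset used elsewhere in this
# collection (all four values are assumptions):
BIOMART_SERVER_URL_GRCH37 = 'http://grch37.ensembl.org'
BIOMART_SERVER_URL_GRCH38 = 'http://www.ensembl.org'
MART = 'ENSEMBL_MART_ENSEMBL'
DATASET = 'hsapiens_gene_ensembl'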
from pybiomart import Server

server = Server(host='http://www.ensembl.org')
server.list_marts()
mart = server['ENSEMBL_MART_ENSEMBL']

from pybiomart import Dataset

dataset = Dataset(name='hsapiens_gene_ensembl',
                  host='http://www.ensembl.org')
dataset.query(attributes=['ensembl_gene_id', 'external_gene_name'],
              filters={'chromosome_name': ['1', '2']})


# pybiomart's Dataset fetches its configuration lazily, the first time the
# attributes or filters properties are accessed:
def attributes(self):
    if self._attributes is None:
        self._filters, self._attributes = self._fetch_configuration()
    return self._attributes


dataset.attributes
dataset.list_attributes()


def filters(self):
    if self._filters is None:
        self._filters, self._attributes = self._fetch_configuration()
    return self._filters


dataset.filters
dataset.list_filters()
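# Exploration sketch building on the calls above: list_filters() returns a
# pandas DataFrame with a 'name' column, so the available filters can be
# searched, e.g. for chromosome-related filters:
filters_df = dataset.list_filters()
print(filters_df[filters_df['name'].str.contains('chromosome')])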
from pybiomart import Dataset, Server
import pandas as pd
import logging

input_file = "D:/Users/Polina/3Dpredictor/input/K562/RNA-seq/rna-seqPolyA.tsv"
inp_data = pd.read_csv(input_file, delimiter="\t")
gene_id_field = 'gene_id'
# strip the version suffix from Ensembl gene IDs
inp_data["Gene_ID"] = inp_data[gene_id_field].apply(lambda x: x.split(".")[0])

server = Server(host='http://grch37.ensembl.org')
mart = server['ENSEMBL_MART_ENSEMBL']
dataset = mart["hsapiens_gene_ensembl"]
query = dataset.query(attributes=[
    'ensembl_gene_id', 'start_position', 'end_position',
    'external_gene_name', 'chromosome_name'
])

FinalData = pd.merge(left=inp_data,
                     right=query,
                     how="inner",
                     left_on="Gene_ID",
                     right_on="Gene stable ID",
                     validate="1:1")
if len(FinalData) != len(inp_data):
    logging.getLogger(__name__).warning(
        "Some data missing in Ensembl, " +
        str(len(inp_data) - len(FinalData)) + " out of " +
        str(len(inp_data)))
FinalData.to_csv(input_file + "pre.txt", sep="\t", index=False)
import numpy as np
import pandas as pd
from pybiomart import Server


def get_bin_gene_region_df(bin_size, chr_stops, region_stops, excluded_bins):
    """
    Creates a pandas.DataFrame with the gene and region corresponding to each bin

    :param bin_size: number of base pairs in a bin
    :param chr_stops: dictionary indicating final unfiltered bin of each chromosome
    :param region_stops: df indicating the final filtered bin of each region
    :param excluded_bins: list of excluded bins
    :return: DataFrame of (gene, chr, region, filtered_bin)
    """
    # Pull genome annotations using pybiomart
    server = Server("www.ensembl.org", use_cache=False)
    dataset = server.marts["ENSEMBL_MART_ENSEMBL"].datasets[
        "hsapiens_gene_ensembl"]
    gene_coordinates = dataset.query(attributes=[
        "chromosome_name", "start_position", "end_position",
        "external_gene_name"
    ],
                                     filters={
                                         'chromosome_name': [
                                             1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                             11, 12, 13, 14, 15, 16, 17, 18,
                                             19, 20, 21, 22, 'X', 'Y'
                                         ]
                                     },
                                     use_attr_names=True)

    bin_gene_region_df = pd.DataFrame(index=range(chr_stops['Y'] + 1))
    chr_stops_df = pd.DataFrame({
        'chr': list(chr_stops.keys()),
        'stop': list(chr_stops.values())
    })

    bin_gene_region_df["region"] = None
    bin_gene_region_df["gene"] = [
        list() for _ in range(bin_gene_region_df.shape[0])
    ]
    bin_gene_region_df["chr"] = [
        list() for _ in range(bin_gene_region_df.shape[0])
    ]

    # for each gene
    for index, row in gene_coordinates.iterrows():
        start_bin = int(row["start_position"] / bin_size)
        stop_bin = int(row["end_position"] / bin_size)
        chromosome = str(row["chromosome_name"])

        if chromosome != "1":  # coordinates are given by chromosome
            chr_start = (chr_stops_df.iloc[np.where(
                chr_stops_df["chr"] == chromosome)[0][0] - 1].stop + 1)
            start_bin = start_bin + chr_start
            stop_bin = stop_bin + chr_start

        gene = row["external_gene_name"]
        for bin_idx in range(start_bin, stop_bin + 1):
            bin_gene_region_df.loc[bin_idx, "gene"].append(gene)
            bin_gene_region_df.loc[bin_idx, "chr"].append(chromosome)

    # Turn columns of lists into columns of strings with comma-separated values
    bin_gene_region_df["gene"] = [
        ",".join(map(str, l)) for l in bin_gene_region_df["gene"]
    ]
    bin_gene_region_df["chr"] = [
        ",".join(map(str, l)) for l in bin_gene_region_df["chr"]
    ]

    # Indicate original_bin-filtered_bin correspondence
    bin_is_excluded = np.zeros(list(chr_stops.values())[-1] + 1)
    bin_is_excluded[excluded_bins] = 1
    bin_is_excluded = bin_is_excluded.astype(bool)
    bin_gene_region_df["filtered_bin"] = None
    bin_gene_region_df["filtered_bin"].iloc[np.where(
        np.array(bin_is_excluded) == False)[0]] = np.arange(
            np.count_nonzero(np.array(bin_is_excluded) == False))

    # Get the regions
    region_stops = list(region_stops)
    last_bin = np.count_nonzero(np.array(bin_is_excluded) == False) - 1
    region_stops.append(last_bin)
    start_bin = 0
    for i, stop_bin in enumerate(region_stops):
        original_start_bin = np.where(
            bin_gene_region_df["filtered_bin"] == start_bin)[0][0]
        original_stop_bin = np.where(
            bin_gene_region_df["filtered_bin"] == stop_bin)[0][0]
        bin_gene_region_df.loc[original_start_bin:original_stop_bin + 1,
                               "region"] = i  # regions are 0 indexed
        start_bin = stop_bin

    return bin_gene_region_df
#!/data/shared_env/bin/python3
from pybiomart import Server
import pandas as pd
import sys

snp_list_file = sys.argv[1]
output_file_name = sys.argv[2]

snp_list = list(set(line.strip() for line in open(snp_list_file)))

server = Server(host='http://www.ensembl.org')
snpMart = server.marts['ENSEMBL_MART_SNP'].datasets['hsapiens_snp']


def query_ensembl(snp_list):
    query_result = snpMart.query(attributes=[
        'refsnp_id', 'associated_variant_risk_allele', 'source_name',
        'clinical_significance', 'phenotype_description', 'pmid'
    ],
                                 filters={'snp_filter': snp_list})
    return query_result


output_data = pd.DataFrame(columns=[
    'Variant name', 'Associated variant risk allele', 'Source name',
    'Clinical significance', 'Phenotype description', 'PubMed ID'
])
for snp in snp_list:
    print(snp)
    query_result = query_ensembl(snp)
    # DataFrame.append was removed in pandas 2.0; concatenate instead
    output_data = pd.concat([output_data, query_result])
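# Since the snp_filter accepts a list of IDs, chunked batch queries are
# usually much faster than one request per SNP; a minimal alternative
# sketch (the chunk size of 200 is an assumption):
chunk = 200
batched_data = pd.concat(
    query_ensembl(snp_list[i:i + chunk])
    for i in range(0, len(snp_list), chunk))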
import os
import shutil
import subprocess
import sys
import warnings
from os import path

import pandas as pd
from pybiomart import Server

warnings.filterwarnings('ignore')

# make sure bedtools (installed next to the interpreter) is on PATH
bin_dir = path.dirname(sys.executable)
os.environ['PATH'] += os.pathsep + bin_dir
shutil.which('bedtools')
subprocess.run(['bedtools', '--help'])

# Read input file
CIViC_variants = pd.read_csv(sys.argv[1])

# Connect to the Ensembl BioMart server
server = Server(host='grch37.ensembl.org')
dataset = (
    server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl'])

# Create a lookup linking ENSG IDs to HUGO gene names
matching_list = dataset.query(
    attributes=['ensembl_gene_id', 'external_gene_name']).drop_duplicates()


# Create probes for variants of different lengths
def create_probe_list(CIViC_variants):
    sparse_tile_variant_types = ['LOSS', 'AMPLIFICATION', 'DELETION']
    civic_coordinate_variants = [
        'DNA BINDING DOMAIN MUTATION', 'PROMOTER DEMETHYLATION',
        'CONSERVED DOMAIN MUT'
    ]