def update_drugbank_mappings():
    """Regenerate the DrugBank mappings table (CHEBI/CHEMBL xrefs and names).

    Prerequisites: PyOBO (https://github.com/pyobo/pyobo) has to be installed
    and the DrugBank download (https://www.drugbank.ca/releases/latest) has
    to be placed into ~/.obo/drugbank/. Obtaining the DrugBank download
    requires signing up for an account and waiting for approval.

    The result is written as a tab-separated file named
    ``drugbank_mappings.tsv`` with one header row followed by the sorted
    mapping rows.
    """
    import pyobo

    to_chembl = pyobo.get_filtered_xrefs('drugbank', 'chembl.compound')
    to_chebi = pyobo.get_filtered_xrefs('drugbank', 'chebi')
    from_chebi = pyobo.get_filtered_xrefs('chebi', 'drugbank')
    names = pyobo.get_id_name_mapping('drugbank')

    # Every record is [DrugBank ID, namespace, value, resource it came from].
    records = [
        [db_id, 'CHEMBL', chembl_id, 'drugbank']
        for db_id, chembl_id in to_chembl.items()
    ]
    records.extend(
        [db_id, 'CHEBI', chebi_id, 'drugbank']
        for db_id, chebi_id in to_chebi.items()
    )
    # These xrefs are keyed by ChEBI ID, so the pair is flipped to keep the
    # DrugBank ID in the first column.
    records.extend(
        [db_id, 'CHEBI', chebi_id, 'chebi']
        for chebi_id, db_id in from_chebi.items()
    )
    records.extend(
        [db_id, 'NAME', label, 'drugbank']
        for db_id, label in names.items()
    )

    out_path = os.path.join(path, 'drugbank_mappings.tsv')
    records.sort()
    table = [['DRUGBANK_ID', 'NAMESPACE', 'ID', 'SOURCE']] + records
    write_unicode_csv(out_path, table, delimiter='\t')
def get_gene_associations_df(identifier: str, *, rows: Optional[int] = None) -> pd.DataFrame:
    """Get gene associations for the given GO identifier as a dataframe.

    The raw associations are post-processed as follows:

    - filtered for human only (taxonomy identifier ``9606``)
    - filtered for proteins only (subjects whose identifier starts with
      ``UniProtKB:``)
    - HGNC and Entrez (``ncbigene``) identifiers are added; rows without an
      HGNC identifier are dropped

    :param identifier: The GO identifier handed to ``get_gene_associations``
    :param rows: Passed through unchanged to ``get_gene_associations``
    :return: A dataframe with the columns ``source_name``, ``target_id``
        (without the ``GO:`` prefix), ``target_label``, ``negated``,
        ``uniprot_id``, ``hgnc_id``, and ``ncbigene_id``
    :raises KeyError: If a retained HGNC identifier has no ``ncbigene`` xref
    """
    associations = get_gene_associations(identifier, rows=rows)
    # NOTE(review): the association records presumably also carry relation
    # metadata and subject extensions; they are intentionally not extracted.
    df = pd.DataFrame(
        [
            (
                e['subject']['id'],
                e['subject']['label'],
                e['subject']['taxon']['id'][len('NCBITaxon:'):],
                e['object']['id'],
                e['object']['label'],
                e['negated'],
            )
            for e in associations
        ],
        columns=[
            'source_id',
            'source_name',
            'taxonomy_id',
            'target_id',
            'target_label',
            'negated',
        ],
    )

    df = df[df['taxonomy_id'] == '9606']
    # Take an explicit copy after filtering: the column assignments below
    # would otherwise target a slice of the previous frame and trigger
    # pandas' SettingWithCopyWarning.
    df = df[df['source_id'].str.startswith('UniProtKB:')].copy()
    df['uniprot_id'] = df['source_id'].map(lambda s: s[len('UniProtKB:'):])
    del df['source_id']
    del df['taxonomy_id']

    df['hgnc_id'] = df['uniprot_id'].map(get_hgnc_id)
    df = df[df['hgnc_id'].notna()].copy()

    # Strict lookup on purpose: a missing xref raises KeyError instead of
    # silently producing an empty cell.
    hgnc_to_ncbigene = pyobo.get_filtered_xrefs('hgnc', 'ncbigene')
    df['ncbigene_id'] = df['hgnc_id'].map(hgnc_to_ncbigene.__getitem__)

    df['target_id'] = df['target_id'].map(lambda s: s[len('GO:'):])
    return df
def test_get_target_xrefs(self):
    """Test that filtered xrefs come back as a dict of prefix-free identifiers."""
    kegg_xrefs = get_filtered_xrefs('chebi', 'kegg', url=TEST_CHEBI_OBO_PATH, local=True)

    # Check the container type before iterating, so that a wrong return type
    # is reported as an assertion failure rather than an AttributeError.
    self.assertIsInstance(kegg_xrefs, dict)

    for key, value in kegg_xrefs.items():
        self.assertFalse(key.startswith('CHEBI:'))
        self.assertFalse(key.startswith('CHEBI'))
        self.assertFalse(key.startswith('chebi:'))
        self.assertFalse(key.startswith('chebi'))
        self.assertFalse(value.startswith('KEGG:'))
        self.assertFalse(value.startswith('KEGG'))
        self.assertFalse(value.startswith('kegg:'))
        self.assertFalse(value.startswith('kegg'))
def test_get_target_xrefs(self):
    """Check that filtered xrefs carry no namespace prefix on either side."""
    with chebi_patch:
        kegg_xrefs = get_filtered_xrefs('chebi', 'kegg')

    for source_id, target_id in kegg_xrefs.items():
        # Keys must not repeat the source namespace in any spelling.
        for prefix in ('CHEBI:', 'CHEBI', 'chebi:', 'chebi'):
            self.assertFalse(source_id.startswith(prefix))
        # Values must not repeat the target namespace in any spelling.
        for prefix in ('KEGG:', 'KEGG', 'kegg:', 'kegg'):
            self.assertFalse(target_id.startswith(prefix))

    self.assertIsInstance(kegg_xrefs, dict)
def test_get_target_xrefs(self):
    """Verify that neither side of the filtered xrefs is namespace-prefixed."""
    source_prefixes = ("CHEBI:", "CHEBI", "chebi:", "chebi")
    target_prefixes = ("KEGG:", "KEGG", "kegg:", "kegg")

    with chebi_patch:
        kegg_xrefs = get_filtered_xrefs("chebi", "kegg")

    for key, value in kegg_xrefs.items():
        # str.startswith accepts a tuple and is true if any prefix matches.
        self.assertFalse(key.startswith(source_prefixes))
        self.assertFalse(value.startswith(target_prefixes))

    self.assertIsInstance(kegg_xrefs, dict)
def mutual_mapping_graph(
    prefixes: Iterable[str],
    skip_sources: Optional[Iterable[str]] = None,
    skip_targets: Optional[Iterable[str]] = None,
) -> nx.Graph:
    """Build the undirected graph of xrefs among the given prefixes.

    Every ordered pair of distinct prefixes is looked up with
    :func:`pyobo.get_filtered_xrefs`, and each returned xref becomes an edge
    between the nodes ``(source, source_id)`` and ``(target, target_id)``.

    :param prefixes: The prefixes whose pairwise xrefs should be collected
    :param skip_sources: Prefixes that should never be used as the source
        side of a lookup
    :param skip_targets: Prefixes that should never be used as the target
        side of a lookup
    :return: The undirected mapping graph containing mappings between entries
        in the given namespaces
    """
    ordered = sorted(prefixes)
    excluded_sources = set(skip_sources) if skip_sources is not None else set()
    excluded_targets = set(skip_targets) if skip_targets is not None else set()

    graph = nx.Graph()
    for source in ordered:
        if source in excluded_sources:
            continue
        for target in ordered:
            if target == source or target in excluded_targets:
                continue
            xrefs = pyobo.get_filtered_xrefs(source, target)
            graph.add_edges_from(
                ((source, source_id), (target, target_id))
                for source_id, target_id in xrefs.items()
            )
    return graph
def populate(self, paths: Optional[Mapping[str, str]] = None):
    """Populate the database.

    Species, proteins, and pathways parsed from the GMT files are added to
    the session, which is committed once at the end.

    :param paths: mapping from tax identifiers to paths to GMT files. If
        empty or not given, the default paths taken from ``infos`` are used.
    :raises TypeError: if ``paths`` is given but is not a dict
    :raises ValueError: if the parsed pathways do not share exactly one
        version, or if the mappings from hgnc to ncbigene could not be loaded
    """
    if not paths:
        logger.info('No paths given.')
        paths = {info.taxonomy_id: info.path for info in infos.values()}
        logger.info(f'Using default paths at {paths}.')
    elif not isinstance(paths, dict):
        raise TypeError('Invalid type for paths. Should be dict.')

    # Only the file paths are needed for parsing; the taxonomy keys are not.
    pathways = [
        pathway
        for path in paths.values()
        for pathway in parse_wikipathways_gmt(path)
    ]

    versions = {
        version
        for _identifier, version, _revision, _name, _species_name, _entries in pathways
    }
    # Report "nothing parsed" separately from "conflicting versions".
    if not versions:
        raise ValueError('got no versions since no pathways were parsed')
    if len(versions) > 1:
        raise ValueError('got multiple versions')
    (version,) = versions

    taxonomy_name_to_id = get_name_id_mapping('ncbitaxon')
    species_names = {
        SPECIES_REMAPPING.get(species_name, species_name)
        for _identifier, _version, _revision, _name, species_name, _entries in pathways
    }
    species_name_to_species = {}
    for species_name in tqdm(species_names, desc=f'v{version} serializing species'):
        taxonomy_id = taxonomy_name_to_id[species_name]
        species = species_name_to_species[species_name] = Species(taxonomy_id=taxonomy_id, name=species_name)
        self.session.add(species)

    hgnc_id_to_entrez_id = get_filtered_xrefs('hgnc', 'ncbigene')
    if not hgnc_id_to_entrez_id:
        raise ValueError('Mappings from hgnc to ncbigene couldnt be loaded')
    # The pathways reference Entrez identifiers, so invert the xrefs.
    entrez_id_to_hgnc_id = {v: k for k, v in hgnc_id_to_entrez_id.items()}
    hgnc_id_to_name = get_id_name_mapping('hgnc')

    missing_entrez_ids = set()
    entrez_ids = {
        entrez_id
        for _identifier, _version, _revision, _name, _species, entrez_ids in pathways
        for entrez_id in entrez_ids
    }
    entrez_id_protein = {}
    for entrez_id in tqdm(entrez_ids, desc=f'v{version} serializing proteins'):
        hgnc_id = entrez_id_to_hgnc_id.get(entrez_id)
        hgnc_symbol = hgnc_id_to_name[hgnc_id] if hgnc_id else None

        if not hgnc_symbol:
            # Use the module logger rather than the root logger so that the
            # message follows this module's logging configuration.
            logger.debug(f"ncbigene:{entrez_id} has no HGNC identifier")
            missing_entrez_ids.add(entrez_id)

        # Proteins without an HGNC mapping are still stored.
        entrez_id_protein[entrez_id] = protein = self.get_or_create_protein(
            entrez_id=entrez_id,
            hgnc_symbol=hgnc_symbol,
            hgnc_id=hgnc_id,
        )
        self.session.add(protein)

    logger.info(f'Proteins: {len(entrez_id_protein)}')
    logger.info(f"Proteins w/o HGNC mapping: {len(missing_entrez_ids)}")

    for (
        wikipathways_id,
        _version,
        revision,
        pathway_name,
        species_name,
        entrez_ids,
    ) in tqdm(pathways, desc=f'v{version} serializing pathways'):
        proteins = [
            entrez_id_protein[entrez_id]
            for entrez_id in entrez_ids
        ]
        pathway = self.get_or_create_pathway(
            identifier=wikipathways_id,
            name=pathway_name.strip(),
            revision=revision,
            # Apply the same remapping that was used to key the species dict.
            species=species_name_to_species[SPECIES_REMAPPING.get(species_name, species_name)],
            proteins=proteins,
        )
        self.session.add(pathway)

    self.session.commit()
def _map_hgnc_to_entrez(hgnc_id):
    """Return the ``ncbigene`` xref for an HGNC identifier, or None if absent."""
    hgnc_to_ncbigene = get_filtered_xrefs('hgnc', 'ncbigene')
    return hgnc_to_ncbigene.get(hgnc_id)
import pybel
import pybel.dsl
from pybel import BELGraph

from ..compath import CompathManager, CompathPathwayMixin, CompathProteinMixin
from ..utils import get_data_dir

logger = logging.getLogger(__name__)

# Short name of this module; also selects its data directory below.
MODULE_NAME = 'pid'
DIRECTORY = get_data_dir(MODULE_NAME)
# Download location of the NCI-Pathway-Info.xlsx spreadsheet.
URL = 'https://github.com/NCIP/pathway-interaction-database/raw/master/download/NCI-Pathway-Info.xlsx'

# Lookup tables; these calls run once, when the module is imported.
chebi_id_to_name = get_id_name_mapping('chebi')
hgnc_name_to_id = get_name_id_mapping('hgnc')
hgnc_id_to_entrez_id = get_filtered_xrefs('hgnc', 'ncbigene')

# Maps a relation label to the BELGraph method associated with it.
relation_to_adder = {
    'controls-state-change-of': BELGraph.add_regulates,
}

# Maps a namespace to the pybel DSL class used for its entities.
namespace_to_dsl = {
    'cas': pybel.dsl.Abundance,
    'uniprot': pybel.dsl.Protein,
    'hprd': pybel.dsl.Protein,
    'chebi': pybel.dsl.Abundance,
    'hgnc': pybel.dsl.Protein,
}

# NOTE(review): presumably collects entries that could not be mapped; it is
# not populated in this part of the file - confirm against its users.
UNMAPPED = set()