Example #1
def main():
    graph = get_obo_graph('chiro')
    chebi_mapping = get_id_name_mapping('chebi')
    mappings = {
        prefix: get_id_name_mapping(prefix)
        for prefix in MAPPING_PREFIXES
    }

    triples = []
    for h, data in graph.nodes(data=True):
        if not data:
            continue
        # Each node carries a single relationship string like "inhibitor_of <target curie>"
        r, t = data['relationship'][0].split()
        r = r[:-len('_of')]  # e.g. "inhibitor_of" -> "inhibitor"

        h_name = chebi_mapping.get(h)
        if h_name is None:
            print(f'Could not find name for chemical {h}')
            continue

        t_namespace = t.split(':')[0].lower()
        t_mapping = mappings[t_namespace]
        t_name = t_mapping.get(t)
        if t_name is None:
            print(f'Could not find name for target {t}')
            continue

        triples.append(('chebi', h, h_name, r, t_namespace, t, t_name))

    with open('chiro_import.tsv', 'w') as file:
        # Seven columns, matching the tuples appended above (the original header
        # listed an extra 'type' column that the rows never fill)
        print('source_db', 'source_id', 'source_name', 'modulation',
              'target_db', 'target_id', 'target_name', sep='\t', file=file)
        for triple in sorted(triples):
            print(*triple, sep='\t', file=file)
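
Reading the export back is straightforward; a minimal sketch, assuming pandas is installed:

import pandas as pd

# Load the TSV written above; column names match the header it prints.
df = pd.read_csv('chiro_import.tsv', sep='\t')
print(df['modulation'].value_counts())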
Example #2
def upload_artifacts_for_prefix(*, prefix: str, bucket: str, s3_client=None):
    """Upload compiled parts for the given prefix to AWS."""
    if s3_client is None:
        s3_client = boto3.client("s3")

    logger.info("[%s] getting id->name mapping", prefix)
    get_id_name_mapping(prefix)
    id_name_path = prefix_cache_join(prefix, name="names.tsv", version=get_version(prefix))
    if not id_name_path.exists():
        raise FileNotFoundError(id_name_path)
    id_name_key = os.path.join(prefix, "cache", "names.tsv")
    logger.info("[%s] uploading id->name mapping", prefix)
    upload_file(path=id_name_path, bucket=bucket, key=id_name_key, s3_client=s3_client)

    logger.info("[%s] getting id->synonyms mapping", prefix)
    get_id_synonyms_mapping(prefix)
    id_synonyms_path = prefix_cache_join(prefix, name="synonyms.tsv", version=get_version(prefix))
    if not id_synonyms_path.exists():
        raise FileNotFoundError(id_synonyms_path)
    id_synonyms_key = os.path.join(prefix, "cache", "synonyms.tsv")
    logger.info("[%s] uploading id->synonyms mapping", prefix)
    upload_file(path=id_synonyms_path, bucket=bucket, key=id_synonyms_key, s3_client=s3_client)

    logger.info("[%s] getting xrefs", prefix)
    get_xrefs_df(prefix)
    xrefs_path = prefix_cache_join(prefix, name="xrefs.tsv", version=get_version(prefix))
    if not xrefs_path.exists():
        raise FileNotFoundError(xrefs_path)
    xrefs_key = os.path.join(prefix, "cache", "xrefs.tsv")
    logger.info("[%s] uploading xrefs", prefix)
    upload_file(path=xrefs_path, bucket=bucket, key=xrefs_key, s3_client=s3_client)

    logger.info("[%s] getting relations", prefix)
    get_relations_df(prefix)
    relations_path = prefix_cache_join(prefix, name="relations.tsv", version=get_version(prefix))
    if not relations_path.exists():
        raise FileNotFoundError(relations_path)
    relations_key = os.path.join(prefix, "cache", "relations.tsv")
    logger.info("[%s] uploading relations", prefix)
    upload_file(path=relations_path, bucket=bucket, key=relations_key, s3_client=s3_client)

    logger.info("[%s] getting properties", prefix)
    get_properties_df(prefix)
    properties_path = prefix_cache_join(prefix, name="properties.tsv", version=get_version(prefix))
    if not properties_path.exists():
        raise FileNotFoundError(properties_path)
    properties_key = os.path.join(prefix, "cache", "properties.tsv")
    logger.info("[%s] uploading properties", prefix)
    upload_file(path=properties_path, bucket=bucket, key=properties_key, s3_client=s3_client)

    logger.info("[%s] getting alternative identifiers", prefix)
    get_id_to_alts(prefix)
    alts_path = prefix_cache_join(prefix, name="alt_ids.tsv", version=get_version(prefix))
    if not alts_path.exists():
        raise FileNotFoundError(alts_path)
    alts_key = os.path.join(prefix, "cache", "alt_ids.tsv")
    logger.info("[%s] uploading alternative identifiers", prefix)
    upload_file(path=alts_path, bucket=bucket, key=alts_key, s3_client=s3_client)
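
Since every artifact goes through the same get/check/upload steps, the function could be condensed; a sketch under the assumption that the same helpers are in scope:

ARTIFACTS = [
    ("names.tsv", get_id_name_mapping),
    ("synonyms.tsv", get_id_synonyms_mapping),
    ("xrefs.tsv", get_xrefs_df),
    ("relations.tsv", get_relations_df),
    ("properties.tsv", get_properties_df),
    ("alt_ids.tsv", get_id_to_alts),
]


def upload_artifacts_for_prefix_compact(*, prefix: str, bucket: str, s3_client=None):
    """Sketch of the same upload flow, one loop iteration per artifact."""
    if s3_client is None:
        s3_client = boto3.client("s3")
    for name, getter in ARTIFACTS:
        getter(prefix)  # build or refresh the cached artifact
        path = prefix_cache_join(prefix, name=name, version=get_version(prefix))
        if not path.exists():
            raise FileNotFoundError(path)
        upload_file(
            path=path,
            bucket=bucket,
            key=os.path.join(prefix, "cache", name),
            s3_client=s3_client,
        )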
Example #3
    def __init__(
        self,
        *,
        graph: BELGraph,
        managers: List,
    ):
        """Initialize the pathway assigner with several lookup dictionaries.

        :param managers: A ComPath manager or iterable of ComPath managers
        """
        self.graph = graph

        self.pathway_to_symbols = defaultdict(set)
        self.symbol_to_pathways = defaultdict(set)

        if not isinstance(managers, list):
            # A single manager may be passed per the docstring; wrap it in a list
            # (the original assigned [] here, silently discarding the manager)
            managers = [managers]

        for manager in managers:
            self._add_manager(manager)

        # These won't be populated further, so convert them to plain dicts
        self.pathway_to_symbols = dict(self.pathway_to_symbols)
        self.symbol_to_pathways = dict(self.symbol_to_pathways)

        hgnc_obo = pyobo.sources.hgnc.get_obo()
        self.hgnc_id_to_symbol = pyobo.get_id_name_mapping('hgnc')

        # Prepare MGI
        self.hgnc_mgi_mapping = hgnc_obo.get_relations_mapping('ro:HOM0000017', 'mgi')
        self.mgi_to_hgnc = {v: k for k, v in self.hgnc_mgi_mapping.items()}
        self.mgi_id_to_symbol = pyobo.get_id_name_mapping('mgi')
        self.mgi_symbol_to_hgnc_symbol = {
            self.mgi_id_to_symbol[mgi_id]: self.hgnc_id_to_symbol[hgnc_id]
            for mgi_id, hgnc_id in self.mgi_to_hgnc.items()
        }

        # Prepare RGD
        self.hgnc_rgd_mapping = hgnc_obo.get_relations_mapping('ro:HOM0000017', 'rgd')
        self.rgd_to_hgnc = {v: k for k, v in self.hgnc_rgd_mapping.items()}
        self.rgd_id_to_symbol = pyobo.get_id_name_mapping('rgd')
        self.rgd_symbol_to_hgnc_symbol = {
            self.rgd_id_to_symbol[rgd_id]: self.hgnc_id_to_symbol[hgnc_id]
            for rgd_id, hgnc_id in self.rgd_to_hgnc.items()
        }

        self.pathway_to_key = defaultdict(set)
        self.key_to_pathway = defaultdict(set)

        self.pmid_to_pathway = defaultdict(set)
        self.pathway_to_pmid = defaultdict(set)

        self.double_annotated = defaultdict(lambda: defaultdict(list))
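
Only the constructor is shown, so usage is speculative; a hypothetical sketch (the class name PathwayAssigner is inferred from the docstring, and the graph and manager objects are placeholders):

assigner = PathwayAssigner(graph=bel_graph, managers=[kegg_manager, reactome_manager])
# Look up the human symbol for a mouse gene symbol via the RO homology mapping
hgnc_symbol = assigner.mgi_symbol_to_hgnc_symbol.get(mouse_symbol)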
Example #4
def get_gilda_terms(prefix: str,
                    url: Optional[str] = None) -> Iterable[gilda.term.Term]:
    """Get gilda terms for the given namespace."""
    id_to_name = get_id_name_mapping(prefix, url=url)
    for identifier, name in tqdm(id_to_name.items(), desc='mapping names'):
        yield gilda.term.Term(
            norm_text=normalize(name),
            text=name,
            db=prefix,
            id=identifier,
            entry_name=name,
            status='name',
            source=prefix,
        )

    id_to_synonyms = get_id_synonyms_mapping(prefix, url=url)
    for identifier, synonyms in tqdm(id_to_synonyms.items(),
                                     desc='mapping synonyms'):
        name = id_to_name[identifier]
        for synonym in synonyms:
            yield gilda.term.Term(
                norm_text=normalize(synonym),
                text=synonym,
                db=prefix,
                id=identifier,
                entry_name=name,
                status='synonym',
                source=prefix,
            )
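
The terms are typically fed into a grounder; a hedged sketch, assuming a gilda version that exposes make_grounder:

import gilda

terms = list(get_gilda_terms('chebi'))
grounder = gilda.make_grounder(terms)  # make_grounder availability is an assumption
matches = grounder.ground('glucose')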
Example #5
def main(
    port: str,
    host: str,
    sql: bool,
    sql_uri: str,
    sql_refs_table: str,
    sql_alts_table: str,
    data: Optional[str],
    test: bool,
    with_gunicorn: bool,
    lazy: bool,
    workers: int,
):
    """Run the resolver app."""
    if test and lazy:
        click.secho('Cannot run in --test and --lazy mode at the same time', fg='red')
        sys.exit(1)

    if test:
        data = [
            (prefix, identifier, name)
            for prefix in ['hgnc', 'chebi', 'doid', 'go']
            for identifier, name in pyobo.get_id_name_mapping(prefix).items()
        ]
        data = pd.DataFrame(data, columns=['prefix', 'identifier', 'name'])

    app = get_app(
        data,
        lazy=lazy,
        sql=sql,
        uri=sql_uri,
        refs_table=sql_refs_table,
        alts_table=sql_alts_table,
    )
    run_app(app=app, host=host, port=port, with_gunicorn=with_gunicorn, workers=workers)
Example #6
def update_drugbank_mappings():
    """Update mappings from DrugBank to CHEBI/CHEMBL"""
    # Note that for this to work, PyOBO (https://github.com/pyobo/pyobo) has
    # to be installed and the DrugBank download
    # (https://www.drugbank.ca/releases/latest) put into ~/.obo/drugbank/
    # Note that the DrugBank download requires signing up for an account and
    # waiting for approval.
    import pyobo
    drugbank_chembl = pyobo.get_filtered_xrefs('drugbank', 'chembl.compound')
    drugbank_chebi = pyobo.get_filtered_xrefs('drugbank', 'chebi')
    chebi_drugbank = pyobo.get_filtered_xrefs('chebi', 'drugbank')
    drugbank_names = pyobo.get_id_name_mapping('drugbank')
    rows = []
    for drugbank_id, chembl_id in drugbank_chembl.items():
        rows.append([drugbank_id, 'CHEMBL', chembl_id, 'drugbank'])
    for drugbank_id, chebi_id in drugbank_chebi.items():
        rows.append([drugbank_id, 'CHEBI', chebi_id, 'drugbank'])
    for chebi_id, drugbank_id in chebi_drugbank.items():
        rows.append([drugbank_id, 'CHEBI', chebi_id, 'chebi'])
    for drugbank_id, name in drugbank_names.items():
        rows.append([drugbank_id, 'NAME', name, 'drugbank'])
    fname = os.path.join(path, 'drugbank_mappings.tsv')
    header = ['DRUGBANK_ID', 'NAMESPACE', 'ID', 'SOURCE']
    rows = [header] + sorted(rows)
    write_unicode_csv(fname, rows, delimiter='\t')
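
Consumers can regroup the flat TSV by namespace; a minimal sketch of a loader for the file written above:

import csv


def load_drugbank_mappings(fname):
    """Sketch: group rows as {namespace: {drugbank_id: value}}."""
    mappings = {}
    with open(fname) as file:
        reader = csv.reader(file, delimiter='\t')
        next(reader)  # skip the header row
        for drugbank_id, namespace, value, _source in reader:
            mappings.setdefault(namespace, {})[drugbank_id] = value
    return mappings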
Example #7
 def test_get_names(self):
     """Test getting names."""
     id_to_name = get_id_name_mapping('chebi', url=TEST_CHEBI_OBO_PATH, local=True)
     for identifier in id_to_name:
         self.assertFalse(identifier.startswith('CHEBI'))
         self.assertFalse(identifier.startswith('CHEBI:'))
         self.assertFalse(identifier.startswith('chebi:'))
         self.assertFalse(identifier.startswith('chebi'))
Example #8
 def test_get_names(self):
     """Test getting names."""
     with chebi_patch:
         id_to_name = get_id_name_mapping('chebi')
     for identifier in id_to_name:
         self.assertFalse(identifier.startswith('CHEBI'))
         self.assertFalse(identifier.startswith('CHEBI:'))
         self.assertFalse(identifier.startswith('chebi:'))
         self.assertFalse(identifier.startswith('chebi'))
Example #9
def get_terms() -> Iterable[Term]:
    """Get ComplexPortal terms."""
    df = get_df()

    df['aliases'] = df['aliases'].map(lambda s: s.split('|')
                                      if pd.notna(s) else [])
    df['members'] = df['members'].map(_parse_members)
    df['xrefs'] = df['xrefs'].map(_parse_xrefs)

    taxonomy_id_to_name = get_id_name_mapping('ncbitaxon')
    df['taxonomy_name'] = df['taxonomy_id'].map(taxonomy_id_to_name.get)

    slim_df = df[[
        'complexportal_id',
        'name',
        'definition',
        'aliases',
        'xrefs',
        'taxonomy_id',
        'taxonomy_name',
        'members',
    ]]
    it = tqdm(slim_df.values,
              total=len(slim_df.index),
              desc=f'mapping {PREFIX}')
    unhandled_xref_type = set()
    for complexportal_id, name, definition, aliases, xrefs, taxonomy_id, taxonomy_name, members in it:
        synonyms = [Synonym(name=alias) for alias in aliases]
        _xrefs = []
        provenance = []
        for reference, note in xrefs:
            if note == 'identity':
                _xrefs.append(reference)
            elif note == 'see-also' and reference.prefix == 'pubmed':
                provenance.append(reference)
            elif (note, reference.prefix) not in unhandled_xref_type:
                logger.debug(
                    f'unhandled xref type: {note} / {reference.prefix}')
                unhandled_xref_type.add((note, reference.prefix))

        term = Term(
            reference=Reference(prefix=PREFIX,
                                identifier=complexportal_id,
                                name=name),
            definition=definition.strip() if pd.notna(definition) else None,
            synonyms=synonyms,
            xrefs=_xrefs,
            provenance=provenance,
        )
        term.set_species(identifier=taxonomy_id, name=taxonomy_name)

        for reference, _count in members:
            term.append_relationship(has_part, reference)

        yield term
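
Since get_terms is a generator, the results are usually materialized once; a short illustrative sketch (the accession shown is hypothetical):

# Index the yielded terms by ComplexPortal identifier for quick lookup
terms_by_id = {term.reference.identifier: term for term in get_terms()}
term = terms_by_id.get('CPX-1')  # hypothetical ComplexPortal accession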
Example #10
def main(port: int, host: str, data: Optional[str], test: bool, gunicorn: bool,
         lazy: bool):
    """Run the resolver app."""
    if test:
        data = [
            (prefix, identifier, name) for prefix in ['hgnc', 'chebi', 'doid']
            for identifier, name in pyobo.get_id_name_mapping(prefix).items()
        ]
        data = pd.DataFrame(data, columns=['prefix', 'identifier', 'name'])

    app = get_app(data, lazy=lazy)
    run_app(app=app, host=host, port=port, gunicorn=gunicorn)
Example #11
def get_gilda_terms(
        prefix: str,
        identifiers_are_names: bool = False) -> Iterable[gilda.term.Term]:
    """Get gilda terms for the given namespace."""
    id_to_name = get_id_name_mapping(prefix)
    it = tqdm(id_to_name.items(),
              desc=f"[{prefix}] mapping",
              unit_scale=True,
              unit="name")
    for identifier, name in it:
        yield gilda.term.Term(
            norm_text=normalize(name),
            text=name,
            db=prefix,
            id=identifier,
            entry_name=name,
            status="name",
            source=prefix,
        )

    id_to_synonyms = get_id_synonyms_mapping(prefix)
    it = tqdm(id_to_synonyms.items(),
              desc=f"[{prefix}] mapping",
              unit_scale=True,
              unit="synonym")
    for identifier, synonyms in it:
        name = id_to_name[identifier]
        for synonym in synonyms:
            yield gilda.term.Term(
                norm_text=normalize(synonym),
                text=synonym,
                db=prefix,
                id=identifier,
                entry_name=name,
                status="synonym",
                source=prefix,
            )

    if identifiers_are_names:
        it = tqdm(get_ids(prefix),
                  desc=f"[{prefix}] mapping",
                  unit_scale=True,
                  unit="id")
        for identifier in it:
            yield gilda.term.Term(
                norm_text=normalize(identifier),
                text=identifier,
                db=prefix,
                id=identifier,
                entry_name=None,
                status="identifier",
                source=prefix,
            )
Example #12
def iter_gilda_prediction_tuples(
    prefix: str,
    relation: str,
    *,
    grounder: Optional[Grounder] = None,
    identifiers_are_names: bool = False,
) -> Iterable[Tuple[str, str, str, str, str, str, str, str, float]]:
    """Iterate over prediction tuples for a given prefix."""
    if grounder is None:
        grounder = gilda.api.grounder
    id_name_mapping = get_id_name_mapping(prefix)
    it = tqdm(id_name_mapping.items(),
              desc=f"[{prefix}] gilda tuples",
              unit_scale=True,
              unit="name")
    for identifier, name in it:
        for scored_match in grounder.ground(name):
            target_prefix = scored_match.term.db.lower()
            yield (
                prefix,
                normalize_identifier(prefix, identifier),
                name,
                relation,
                target_prefix,
                normalize_identifier(target_prefix, scored_match.term.id),
                scored_match.term.entry_name,
                "lexical",
                scored_match.score,
            )

    if identifiers_are_names:
        it = tqdm(get_ids(prefix),
                  desc=f"[{prefix}] gilda tuples",
                  unit_scale=True,
                  unit="id")
        for identifier in it:
            for scored_match in grounder.ground(identifier):
                target_prefix = scored_match.term.db.lower()
                yield (
                    prefix,
                    normalize_identifier(prefix, identifier),
                    identifier,
                    relation,
                    target_prefix,
                    normalize_identifier(target_prefix, scored_match.term.id),
                    scored_match.term.entry_name,
                    "lexical",
                    scored_match.score,
                )
Example #13
def main():
    """Run the MeSH curation pipeline."""
    xrefs_df = get_xrefs_df()
    mesh_xrefs_df = xrefs_df[xrefs_df['source_db'] == 'mesh']
    curated_mesh_ids = set(mesh_xrefs_df['source_id'])

    terms = {
        identifier: (name, suffix.strip('s'))
        for identifier, name in pyobo.get_id_name_mapping('mesh').items()
        if identifier not in curated_mesh_ids and identifier not in BLACKLIST
        for suffix in SUFFIXES if name.lower().endswith(suffix)
    }

    it = sorted(terms.items(), key=lambda t: t[1][0])
    for identifier, (name, suffix) in it:
        print('mesh', identifier, name, suffix, '?', '?', '?', '?', sep='\t')
Example #14
def get_all_enzymes():
    ec_code_path = os.path.join(str(Path.home()), '.obo', 'ec-code', 'ec-code.obo')
    if not os.path.exists(ec_code_path):
        # Trigger PyOBO to download and cache the EC OBO file first
        _ = pyobo.get_id_name_mapping('ec-code')
    obo = obonet.read_obo(ec_code_path)
    up_nodes = set()
    for node in obo.nodes:
        if node.startswith('uniprot'):
            up_nodes.add(node[len('uniprot:'):])
    human_ups = {u for u in up_nodes if uniprot_client.is_human(u)}
    enzymes = {uniprot_client.get_gene_name(u) for u in human_ups}
    enzymes = {g for g in enzymes if not hgnc_client.is_kinase(g)}
    enzymes = {g for g in enzymes if not hgnc_client.is_phosphatase(g)}
    logger.info(f'Filtered {len(enzymes)} enzymes in total')
    return enzymes
Example #15
def main(show_ungrounded: bool, output: Optional[TextIO]):
    """Run the MeSH curation pipeline."""
    xrefs_df = get_xrefs_df()
    mesh_xrefs_df = xrefs_df[xrefs_df['source_db'] == 'mesh']
    curated_mesh_ids = set(mesh_xrefs_df['source_id'])

    terms = {
        identifier: (name, name[:-len(suffix)], suffix.strip('s'))
        for identifier, name in pyobo.get_id_name_mapping('mesh').items()
        if identifier not in curated_mesh_ids and identifier not in BLACKLIST
        for suffix in SUFFIXES if name.lower().endswith(suffix)
    }

    it = sorted(terms.items(), key=lambda t: t[1][0])
    it = tqdm(it, desc='making MeSH curation sheet')
    for i, (identifier, (name, search_text, suffix)) in enumerate(it, start=1):
        for row in yield_gilda('mesh', identifier, name, suffix, search_text,
                               show_ungrounded or output is not None):
            print(*row, sep='\t', file=output)
Example #16
def _get_example(prefix: str) -> Optional[str]:
    if prefix in {'gaz', 'bila', 'pubchem.compound'}:
        return None
    if prefix in pyobo.getters.SKIP:
        return None
    try:
        x = pyobo.get_id_name_mapping(prefix)
    except (pyobo.getters.NoBuild, ValueError, urllib.error.URLError):
        return None
    if not x:
        return None
    x = list(x)
    # random.randint includes both endpoints, so x[random.randint(0, len(x))]
    # could raise IndexError; random.choice is the idiomatic fix, and x is
    # guaranteed non-empty by the check above.
    rv = random.choice(x)  # noqa:S311
    print('adding', prefix, rv)
    return rv
Example #17
def iter_gilda_prediction_tuples(prefix: str,
                                 relation: str) -> Iterable[PredictionTuple]:
    """Iterate over prediction tuples for a given prefix."""
    provenance = get_script_url(__file__)
    id_name_mapping = pyobo.get_id_name_mapping(prefix)
    for identifier, name in tqdm(id_name_mapping.items(),
                                 desc=f'Mapping {prefix}'):
        for scored_match in gilda.ground(name):
            yield PredictionTuple(
                prefix,
                identifier,
                name,
                relation,
                scored_match.term.db.lower(),
                scored_match.term.id,
                scored_match.term.entry_name,
                'lexical',
                scored_match.score,
                provenance,
            )
Example #18
BIOLOGICAL_ROLE_ID = '24432'
APPLICATION_ROLE_ID = '33232'
BIOCHEMICAL_ROLE_CHEBI_ID = '52206'
PATHWAY_INHIBITOR_CHEBI_ID = '76932'
ENZYME_INHIBITOR_CHEBI_ID = '23924'
AGONIST_CHEBI_ID = '48705'
INVERSE_AGONIST_CHEBI_ID = '90847'
INHIBITOR_CHEBI_ID = '35222'
ANTAGONIST_CHEBI_ID = '48706'
BLACKLIST = [
    '48001',  # protein synthesis inhibitor
    '64106',  # protein kinase agonist
]

chebi_obo = pyobo.get('chebi')
chebi_id_to_name = pyobo.get_id_name_mapping('chebi')

XREFS_DF = get_xrefs_df()
CURATED_ROLE_CHEBI_IDS = {
    source_id[len('CHEBI:'):]
    for source_db, source_id in XREFS_DF[['source_db', 'source_id']].values
    if source_db == 'chebi'
}
IRRELEVANT_ROLE_CHEBI_IDS = set(
    itt.chain.from_iterable(
        chebi_obo.descendants(chebi_id[len('CHEBI:'):])
        for chebi_id in get_irrelevant_roles_df().identifier
        if chebi_id[len('CHEBI:'):] in chebi_obo.hierarchy))


def _get_inhibitors_reclassification() -> pd.DataFrame:
Example #19
    def populate(self, paths: Optional[Mapping[str, str]] = None):
        """Populate the database.

        :param paths: mapping from tax identifiers to paths to GMT files
        """
        if not paths:
            logger.info('No paths given.')
            paths = {info.taxonomy_id: info.path for info in infos.values()}
            logger.info(f'Using default paths at {paths}.')
        elif not isinstance(paths, dict):
            raise TypeError('Invalid type for paths. Should be a dict.')

        pathways = [
            pathway
            for taxonomy_id, path in paths.items()
            for pathway in parse_wikipathways_gmt(path)
        ]

        versions = {
            version
            for _identifier, version, _revision, _name, _species_name, _entries in pathways
        }
        if len(versions) != 1:
            raise ValueError('got multiple versions')
        version = list(versions)[0]

        taxonomy_name_to_id = get_name_id_mapping('ncbitaxon')
        species_names = {
            SPECIES_REMAPPING.get(species_name, species_name)
            for _identifier, _version, _revision, _name, species_name, _entries in pathways
        }
        species_name_to_species = {}
        for species_name in tqdm(species_names, desc=f'v{version} serializing species'):
            taxonomy_id = taxonomy_name_to_id[species_name]
            species = species_name_to_species[species_name] = Species(taxonomy_id=taxonomy_id, name=species_name)
            self.session.add(species)

        hgnc_id_to_entrez_id = get_filtered_xrefs('hgnc', 'ncbigene')
        if not hgnc_id_to_entrez_id:
            raise ValueError("Mappings from hgnc to ncbigene couldn't be loaded")

        entrez_id_to_hgnc_id = {v: k for k, v in hgnc_id_to_entrez_id.items()}
        hgnc_id_to_name = get_id_name_mapping('hgnc')

        missing_entrez_ids = set()
        entrez_ids = {
            entrez_id
            for _identifier, _version, _revision, _name, _species, entrez_ids in pathways
            for entrez_id in entrez_ids
        }
        entrez_id_protein = {}
        for entrez_id in tqdm(entrez_ids, desc=f'v{version} serializing proteins'):
            hgnc_id = entrez_id_to_hgnc_id.get(entrez_id)
            if hgnc_id:
                hgnc_symbol = hgnc_id_to_name[hgnc_id]
            else:
                hgnc_symbol = None

            if not hgnc_symbol:
                logger.debug(f"ncbigene:{entrez_id} has no HGNC identifier")
                missing_entrez_ids.add(entrez_id)

            entrez_id_protein[entrez_id] = protein = self.get_or_create_protein(
                entrez_id=entrez_id,
                hgnc_symbol=hgnc_symbol,
                hgnc_id=hgnc_id,
            )
            self.session.add(protein)

        logger.info(f'Proteins: {len(entrez_id_protein)}')
        logger.info(f"Proteins w/o HGNC mapping: {len(missing_entrez_ids)}")

        for (
            wikipathways_id, _version, revision,
            pathway_name, species_name, entrez_ids,
        ) in tqdm(pathways, desc=f'v{version} serializing pathways'):
            proteins = [
                entrez_id_protein[entrez_id]
                for entrez_id in entrez_ids
            ]

            pathway = self.get_or_create_pathway(
                identifier=wikipathways_id,
                name=pathway_name.strip(),
                revision=revision,
                species=species_name_to_species[SPECIES_REMAPPING.get(species_name, species_name)],
                proteins=proteins,
            )
            self.session.add(pathway)

        self.session.commit()
Example #20
def _get_urls(prefix='doid', host='localhost', port=5000):
    identifiers = pyobo.get_id_name_mapping(prefix)
    return [
        f'http://{host}:{port}/resolve/{prefix}:{identifier}'
        for identifier in identifiers
    ]
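
A hypothetical smoke test over the generated URLs, assuming requests is available and a resolver is running locally:

import requests

for url in _get_urls(prefix='doid'):
    response = requests.get(url, timeout=10)
    response.raise_for_status()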
Example #21
import logging

from tqdm import tqdm

import pybel
import pybel.dsl
from pybel import BELGraph
from pyobo import get_filtered_xrefs, get_id_name_mapping, get_name_id_mapping

from ..compath import CompathManager, CompathPathwayMixin, CompathProteinMixin
from ..utils import get_data_dir

logger = logging.getLogger(__name__)

MODULE_NAME = 'pid'
DIRECTORY = get_data_dir(MODULE_NAME)

URL = 'https://github.com/NCIP/pathway-interaction-database/raw/master/download/NCI-Pathway-Info.xlsx'

chebi_id_to_name = get_id_name_mapping('chebi')
hgnc_name_to_id = get_name_id_mapping('hgnc')
hgnc_id_to_entrez_id = get_filtered_xrefs('hgnc', 'ncbigene')

relation_to_adder = {
    'controls-state-change-of': BELGraph.add_regulates,
}

namespace_to_dsl = {
    'cas': pybel.dsl.Abundance,
    'uniprot': pybel.dsl.Protein,
    'hprd': pybel.dsl.Protein,
    'chebi': pybel.dsl.Abundance,
    'hgnc': pybel.dsl.Protein,
}
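
The namespace_to_dsl table maps each namespace to a PyBEL node class; a sketch of how it might be applied when building nodes (the fallback to Abundance is an assumption):

def make_node(namespace: str, identifier: str, name: str) -> pybel.dsl.BaseEntity:
    """Sketch: pick the DSL class for a namespace and build a node."""
    dsl_cls = namespace_to_dsl.get(namespace, pybel.dsl.Abundance)
    return dsl_cls(namespace=namespace, identifier=identifier, name=name)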
Example #22
def get_terms(version: str) -> Iterable[Term]:
    """Get ComplexPortal terms."""
    df = get_df(version=version)
    df.rename(
        inplace=True,
        columns={
            "Aliases for complex": "aliases",
            "Identifiers (and stoichiometry) of molecules in complex": "members",
            "Taxonomy identifier": "taxonomy_id",
            "Cross references": "xrefs",
            "Description": "definition",
            "Recommended name": "name",
            "#Complex ac": "complexportal_id",
        },
    )

    df["aliases"] = df["aliases"].map(lambda s: s.split("|") if pd.notna(s) else [])
    df["members"] = df["members"].map(_parse_members)
    df["xrefs"] = df["xrefs"].map(_parse_xrefs)

    taxonomy_id_to_name = get_id_name_mapping("ncbitaxon")
    df["taxonomy_name"] = df["taxonomy_id"].map(taxonomy_id_to_name.get)

    slim_df = df[
        [
            "complexportal_id",
            "name",
            "definition",
            "aliases",
            "xrefs",
            "taxonomy_id",
            "taxonomy_name",
            "members",
        ]
    ]
    it = tqdm(slim_df.values, total=len(slim_df.index), desc=f"mapping {PREFIX}")
    unhandled_xref_type = set()
    for (
        complexportal_id,
        name,
        definition,
        aliases,
        xrefs,
        taxonomy_id,
        taxonomy_name,
        members,
    ) in it:
        synonyms = [Synonym(name=alias) for alias in aliases]
        _xrefs = []
        provenance = []
        for reference, note in xrefs:
            if note == "identity":
                _xrefs.append(reference)
            elif note == "see-also" and reference.prefix == "pubmed":
                provenance.append(reference)
            elif (note, reference.prefix) not in unhandled_xref_type:
                logger.debug(f"unhandled xref type: {note} / {reference.prefix}")
                unhandled_xref_type.add((note, reference.prefix))

        term = Term(
            reference=Reference(prefix=PREFIX, identifier=complexportal_id, name=name),
            definition=definition.strip() if pd.notna(definition) else None,
            synonyms=synonyms,
            xrefs=_xrefs,
            provenance=provenance,
        )
        term.set_species(identifier=taxonomy_id, name=taxonomy_name)

        for reference, _count in members:
            term.append_relationship(has_part, reference)

        yield term
Example #23
def get_relations_df() -> pd.DataFrame:
    """Assemble the relations dataframe."""
    xrefs_df = get_xrefs_df()

    logger.info('loading famplex mapping')
    famplex_id_to_members = defaultdict(list)
    famplex_relations_df = pd.read_csv(FAMPLEX_RELATIONS_URL)
    for source_db, source_name, rel, target_db, target_name in famplex_relations_df.values:
        if source_db.lower() == 'hgnc' and rel == 'isa' and target_db.lower() == 'fplx':
            try:
                hgnc_id = hgnc_name_to_id[source_name]
            except KeyError:
                logger.warning(f'Could not find {source_name} for fplx:{target_name}')
                continue
            famplex_id_to_members[target_name].append((hgnc_id, source_name))

    logger.info('getting enzyme classes')
    expasy_graph, ec_code_to_children = get_expasy_closure()
    logger.info('getting ec2go')
    ec2go = get_ec2go()

    logger.info('inferring over target hierarchies')
    x = defaultdict(list)
    for source_db, source_id, _, modulation, target_type, target_db, target_id, target_name in xrefs_df.values:
        if source_db != 'chebi':
            continue

        if target_db == 'hgnc':
            # Append original
            x[source_db, source_id].append((modulation, 'protein', 'hgnc', target_id, target_name))
            # Append inferred
            for uniprot_id, uniprot_name in get_uniprot_id_names(target_id):
                x[source_db, source_id].append((modulation, 'protein', 'uniprot', uniprot_id, uniprot_name))

        elif target_db == 'fplx':
            # Append original
            x[source_db, source_id].append((modulation, target_type, target_db, target_id, target_name))
            # Append inferred
            for hgnc_id, hgnc_symbol in famplex_id_to_members.get(target_id, []):
                x[source_db, source_id].append((modulation, 'protein', 'hgnc', hgnc_id, hgnc_symbol))
                for uniprot_id, uniprot_name in get_uniprot_id_names(hgnc_id):
                    x[source_db, source_id].append((modulation, 'protein', 'uniprot', uniprot_id, uniprot_name))

        elif target_db == 'ec-code':
            children_ec_codes = ec_code_to_children.get(target_id)
            if children_ec_codes is None:
                # this is the case for about 15 entries
                logger.info(f'could not find children of {target_db}:{target_id}')
                continue

            for sub_target_db, sub_target_id, sub_target_name in children_ec_codes:
                target_type = DB_TO_TYPE[sub_target_db]
                x[source_db, source_id].append((
                    modulation, target_type, sub_target_db, sub_target_id, sub_target_name,
                ))

            for go_id, go_name in ec2go.get(target_id, []):
                x[source_db, source_id].append((
                    modulation, 'molecular function', 'go', go_id, go_name,
                ))

        else:
            x[source_db, source_id].append((modulation, target_type, target_db, target_id, target_name))

    logger.info('inferring over role hierarchies')
    db_to_role_to_chemical_curies = {
        'chebi': get_chebi_role_to_children(),
    }
    db_to_id_mapping = {
        'chebi': get_id_name_mapping('chebi'),
    }
    #: A set of databases to remove the prefix from
    remove_prefix = {'chebi'}

    rows = []
    for (role_db, role_id), entries in x.items():
        if role_db in remove_prefix and role_id.lower().startswith(f'{role_db}:'.lower()):
            role_id = role_id[len(f'{role_db}:'):]

        # TODO map role_db, role_id to set of sub_role_db, sub_role_id
        sub_role_curies = {(role_db, role_id)}

        for modulation, target_type, target_db, target_id, target_name in entries:
            chemical_curies = set(itt.chain.from_iterable(
                db_to_role_to_chemical_curies[sub_role_db].get(sub_role_id, [])
                for sub_role_db, sub_role_id in sub_role_curies
            ))
            if not chemical_curies:
                logger.debug('no inference for %s:%s', role_db, role_id)
                continue
            for chemical_db, chemical_id in chemical_curies:
                rows.append((
                    chemical_db, chemical_id, db_to_id_mapping[chemical_db][chemical_id],
                    modulation, target_type, target_db, target_id, target_name,
                ))
    return pd.DataFrame(rows, columns=XREFS_COLUMNS)
Example #24
def main(
    port: str,
    host: str,
    sql: bool,
    sql_uri: str,
    sql_refs_table: str,
    sql_alts_table: str,
    sql_defs_table: str,
    name_data: Optional[str],
    alts_data: Optional[str],
    defs_data: Optional[str],
    test: bool,
    with_gunicorn: bool,
    lazy: bool,
    workers: int,
):
    """Run the resolver app."""
    if test and lazy:
        click.secho('Cannot run in --test and --lazy mode at the same time', fg='red')
        sys.exit(1)

    from .resolver import get_app

    if test:
        from pyobo import get_id_name_mapping, get_alts_to_id, get_id_definition_mapping
        import pandas as pd
        prefixes = ['hgnc', 'chebi', 'doid', 'go']
        name_data = pd.DataFrame(
            [
                (prefix, identifier, name)
                for prefix in prefixes
                for identifier, name in get_id_name_mapping(prefix).items()
            ],
            columns=['prefix', 'identifier', 'name'],
        )
        alts_data = pd.DataFrame(
            [
                (prefix, alt, identifier)
                for prefix in prefixes
                for alt, identifier in get_alts_to_id(prefix).items()
            ],
            columns=['prefix', 'alt', 'identifier'],
        )
        defs_data = pd.DataFrame(
            [
                (prefix, identifier, definition)
                for prefix in prefixes
                for identifier, definition in get_id_definition_mapping(prefix).items()
            ],
            columns=['prefix', 'identifier', 'definition'],
        )

    app = get_app(
        name_data=name_data,
        alts_data=alts_data,
        defs_data=defs_data,
        lazy=lazy,
        sql=sql,
        uri=sql_uri,
        refs_table=sql_refs_table,
        alts_table=sql_alts_table,
        defs_table=sql_defs_table,
    )
    run_app(app=app, host=host, port=port, with_gunicorn=with_gunicorn, workers=workers)