Exemplo n.º 1
0
def get_terms(force: bool = False) -> Iterable[Term]:
    """Get dictyBase gene terms.

    :param force: Should cached files be re-downloaded?
    :yields: One term per gene, with product/synonym annotations and
        UniProt gene-product relationships.
    """
    # DDB ID	DDB_G ID	Name	UniProt ID
    uniprot_mappings = multisetdict(
        ensure_df(PREFIX, url=URL, force=force, name="uniprot_mappings.tsv", usecols=[1, 3]).values
    )

    terms = ensure_df(PREFIX, url=URL, force=force, name="gene_info.tsv")
    # GENE ID (DDB_G ID)	Gene Name	Synonyms	Gene products
    for identifier, name, synonyms, products in tqdm(terms.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=name,
        )
        if products and pd.notna(products) and products != "unknown":
            for synonym in products.split(","):
                term.append_synonym(synonym.strip())
        if synonyms and pd.notna(synonyms):
            for synonym in synonyms.split(","):
                term.append_synonym(Synonym(synonym.strip()))
        for uniprot_id in uniprot_mappings.get(identifier, []):
            # BUG FIX: the membership test was inverted (``not in``), which
            # skipped every real UniProt accession and let only the
            # "unknown"/"pseudogene" placeholders through. Skip missing
            # values and placeholders; keep real accessions.
            if not uniprot_id or pd.isna(uniprot_id) or uniprot_id in {"unknown", "pseudogene"}:
                continue
            term.append_relationship(has_gene_product, Reference.auto("uniprot", uniprot_id))

        term.set_species(identifier="44689", name="Dictyostelium discoideum")
        yield term
Exemplo n.º 2
0
def _get_xref_df(version: str) -> Mapping[str, List[Reference]]:
    """Build a mapping from FamPlex identifier to external references."""
    base_url = f"https://raw.githubusercontent.com/sorgerlab/famplex/{version}"
    xrefs_url = f"{base_url}/equivalences.csv"
    df = ensure_df(
        PREFIX,
        url=xrefs_url,
        version=version,
        header=None,
        sep=",",
        dtype=str,
    )

    # Normalize nextprot families
    df[0] = df[0].map(lambda prefix: {"NXP": "nextprot.family"}.get(prefix, prefix))
    df[1] = [
        identifier[len("FA:"):] if prefix == "nextprot.family" else identifier
        for prefix, identifier in df[[0, 1]].values
    ]

    # Drop rows whose prefix cannot be normalized, and BEL xrefs.
    df[0] = df[0].map(normalize_prefix)
    df = df[df[0].notna()]
    df = df[df[0] != "bel"]
    return multidict(
        (famplex_id, Reference(xref_prefix, xref_identifier))
        for xref_prefix, xref_identifier, famplex_id in df.values
    )
Exemplo n.º 3
0
def get_df() -> pd.DataFrame:
    """Get the BioGRID identifiers mapping dataframe."""
    version = bioversions.get_version('biogrid')
    url = f'{BASE_URL}/BIOGRID-{version}/BIOGRID-IDENTIFIERS-{version}.tab.zip'
    rv = ensure_df(PREFIX, url=url, skiprows=28, dtype=str, version=version)
    # Resolve each organism name to its NCBI taxonomy identifier.
    rv['taxonomy_id'] = rv['ORGANISM_OFFICIAL_NAME'].map(_lookup)
    return rv
Exemplo n.º 4
0
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over selventa chemical terms."""
    df = ensure_df(PREFIX, url=URL, skiprows=8, force=force)
    for identifier, label, xrefs in df[["ID", "LABEL", "XREF"]].values:
        term = Term.from_triple(PREFIX, identifier, label)
        # The xref cell is a pipe-delimited list; it may be missing entirely.
        if pd.notna(xrefs):
            for xref in xrefs.split("|"):
                term.append_xref(xref)
        yield term
Exemplo n.º 5
0
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get PomBase gene terms.

    First yields one term per Sequence Ontology gene type appearing in the
    data, then one term per gene with that SO term as its parent, plus
    HGNC orthologs, UniProt gene product, and synonyms.
    """
    # Two columns: PomBase identifier, pipe-delimited HGNC gene symbols.
    orthologs_df = ensure_df(PREFIX,
                             url=ORTHOLOGS_URL,
                             force=force,
                             header=None,
                             version=version)
    identifier_to_hgnc_ids = defaultdict(set)
    hgnc_symbol_to_id = pyobo.get_name_id_mapping("hgnc")
    for identifier, hgnc_symbols in orthologs_df.values:
        if hgnc_symbols == "NONE":
            # Placeholder meaning no human ortholog is annotated.
            continue
        for hgnc_symbol in hgnc_symbols.split("|"):
            hgnc_id = hgnc_symbol_to_id.get(hgnc_symbol)
            if hgnc_id is not None:
                identifier_to_hgnc_ids[identifier].add(hgnc_id)

    df = ensure_df(PREFIX, url=URL, force=force, header=None, version=version)
    # Map each gene type in column 6 to its Sequence Ontology reference.
    so = {
        gtype: Reference.auto("SO", POMBASE_TO_SO[gtype])
        for gtype in sorted(df[df.columns[6]].unique())
    }
    # Yield the SO parent terms before the gene terms that reference them.
    for _, reference in sorted(so.items()):
        yield Term(reference=reference)
    for identifier, _, symbol, chromosome, name, uniprot_id, gtype, synonyms in tqdm(
            df.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=symbol if pd.notna(symbol) else None,
            definition=name if pd.notna(name) else None,
        )
        # Chromosome values carry a "chromosome_" prefix; strip it.
        term.append_property("chromosome", chromosome[len("chromosome_"):])
        term.append_parent(so[gtype])
        term.set_species(identifier="4896", name="Schizosaccharomyces pombe")
        for hgnc_id in identifier_to_hgnc_ids.get(identifier, []):
            term.append_relationship(orthologous,
                                     Reference.auto("hgnc", hgnc_id))
        if uniprot_id and pd.notna(uniprot_id):
            term.append_relationship(has_gene_product,
                                     Reference.auto("uniprot", uniprot_id))
        if synonyms and pd.notna(synonyms):
            # Synonyms are comma-delimited.
            for synonym in synonyms.split(","):
                term.append_synonym(Synonym(synonym))
        yield term
Exemplo n.º 6
0
def ensure_list_pathways(version: str) -> Mapping[str, str]:
    """Ensure the KEGG Map (non species specific)."""
    df = ensure_df(
        KEGG_PATHWAY_PREFIX,
        url=f"{BASE}/list/pathway",
        name="pathway.tsv",
        version=version,
    )
    # Identifiers are prefixed with "path:"; strip it for the mapping keys.
    return {identifier[len("path:"):]: name for identifier, name in df.values}
Exemplo n.º 7
0
def _get_synonyms(version, force):
    """Download the FlyBase synonyms file as a dataframe."""
    url = (
        f"http://ftp.flybase.net/releases/FB{version}"
        f"/precomputed_files/synonyms/fb_synonym_fb_{version}.tsv.gz"
    )
    # TODO use this
    return ensure_df(
        PREFIX,
        url=url,
        force=force,
        version=version,
        skiprows=4,
        usecols=[0, 2],
    )
Exemplo n.º 8
0
def get_premature_to_prefamily_df(version: str) -> pd.DataFrame:
    """Get premature miRNA to premature family dataframe."""
    return ensure_df(
        PREFIX,
        url=f'ftp://mirbase.org/pub/mirbase/{version}/database_files/mirna_2_prefam.txt.gz',
        version=version,
        dtype=str,
        names=['premature_key', 'prefamily_key'],
    )
Exemplo n.º 9
0
def _get_definitions(version: str, force: bool = False) -> Mapping[str, str]:
    """Map FlyBase gene identifiers to their automated summaries."""
    url = f"http://ftp.flybase.net/releases/FB{version}/precomputed_files/genes/automated_gene_summaries.tsv.gz"
    summaries = ensure_df(
        PREFIX,
        url=url,
        force=force,
        version=version,
        skiprows=2,
        header=None,
        usecols=[0, 1],
    )
    return {identifier: definition for identifier, definition in summaries.values}
Exemplo n.º 10
0
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over selventa disease terms."""
    df = ensure_df(PREFIX, url=URL, skiprows=9, force=force)

    for identifier, label, synonyms, xrefs in df[["ID", "LABEL", "SYNONYMS", "XREF"]].values:
        term = Term.from_triple(PREFIX, identifier, label)
        # Both columns are pipe-delimited and may be missing.
        if pd.notna(synonyms):
            for synonym in synonyms.split("|"):
                term.append_synonym(synonym)
        if pd.notna(xrefs):
            for xref in xrefs.split("|"):
                term.append_xref(xref)
        yield term
Exemplo n.º 11
0
def get_premature_family_df(version: str) -> pd.DataFrame:
    """Get premature family dataframe."""
    columns = ["prefamily_key", "family_id", "family_name"]
    return ensure_df(
        PREFIX,
        url=f"ftp://mirbase.org/pub/mirbase/{version}/database_files/mirna_prefam.txt.gz",
        version=version,
        names=columns,
        usecols=[0, 1, 2],
        index_col=0,
        dtype=str,
    )
Exemplo n.º 12
0
def get_premature_to_prefamily_df(version: str,
                                  force: bool = False) -> pd.DataFrame:
    """Get premature miRNA to premature family dataframe."""
    url = f"https://mirbase.org/ftp/{version}/database_files/mirna_2_prefam.txt.gz"
    return ensure_df(
        PREFIX,
        url=url,
        version=version,
        force=force,
        dtype=str,
        names=["premature_key", "prefamily_key"],
    )
Exemplo n.º 13
0
def get_premature_df(version: str, force: bool = False) -> pd.DataFrame:
    """Get premature miRNA dataframe."""
    columns = ["premature_key", "mirbase_id", "mirna_name"]
    return ensure_df(
        PREFIX,
        url=f"https://mirbase.org/ftp/{version}/database_files/mirna.txt.gz",
        version=version,
        force=force,
        names=columns,
        usecols=[0, 1, 2],
        dtype=str,
    )
Exemplo n.º 14
0
def get_premature_df(version: str) -> pd.DataFrame:
    """Get premature miRNA dataframe."""
    url = f'ftp://mirbase.org/pub/mirbase/{version}/database_files/mirna.txt.gz'
    return ensure_df(
        PREFIX,
        url=url,
        version=version,
        dtype=str,
        index_col=0,
        usecols=[0, 1, 2],
        names=['premature_key', 'mirbase_id', 'mirna_name'],
    )
Exemplo n.º 15
0
def _get_names(version: str, force: bool = False) -> pd.DataFrame:
    """Download the expanded FBgn/FBtr/FBpp mapping file as a dataframe."""
    url = (
        f"{BASE_URL}/FB{version}/precomputed_files/genes/"
        f"fbgn_fbtr_fbpp_expanded_fb_{version}.tsv.gz"
    )
    return ensure_df(
        PREFIX,
        url=url,
        force=force,
        version=version,
        skiprows=4,
        usecols=[0, 1, 2, 3, 4],
        skipfooter=1,
    )
Exemplo n.º 16
0
def _get_organisms(version: str, force: bool = False) -> Mapping[str, str]:
    """Get mapping from abbreviation column to NCBI taxonomy ID column."""
    url = f"http://ftp.flybase.net/releases/FB{version}/precomputed_files/species/organism_list_fb_{version}.tsv.gz"
    df = ensure_df(
        PREFIX,
        url=url,
        force=force,
        version=version,
        skiprows=4,
        header=None,
        usecols=[2, 4],
    )
    # Drop rows missing either column before building the mapping.
    df = df.dropna()
    return dict(df.values)
Exemplo n.º 17
0
def get_chembl_compound_equivalences_raw(usecols=None,
                                         version: Optional[str] = None
                                         ) -> pd.DataFrame:
    """Get the chemical representations raw dataframe."""
    # Default to the latest ChEMBL release when no version is given.
    if version is None:
        version = bioversions.get_version('chembl')

    url = (
        f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/'
        f'chembl_{version}/chembl_{version}_chemreps.txt.gz'
    )
    return ensure_df(CHEMBL_COMPOUND_PREFIX, url=url, sep='\t', usecols=usecols)
Exemplo n.º 18
0
def get_chunks(force: bool = False) -> pd.DataFrame:
    """Get the BioGRID identifiers mapping dataframe."""
    version = bioversions.get_version(PREFIX)
    # chunksize makes ensure_df return an iterator of frames rather than
    # loading the whole file at once.
    rv = ensure_df(
        PREFIX,
        url=URL,
        name="results.csv",
        version=version,
        force=force,
        sep=",",
        chunksize=CHUNKSIZE,
        usecols=[0, 1, 2, 3, 5],
    )
    return rv
Exemplo n.º 19
0
def get_df(version: str) -> pd.DataFrame:
    """Get a combine ComplexPortal dataframe."""
    url_base = f'ftp://ftp.ebi.ac.uk/pub/databases/intact/complex/{version}/complextab'
    # Download one per-species table, then stack them into a single frame.
    species_dfs = []
    for ncbitaxonomy_id in SPECIES:
        species_df = ensure_df(
            PREFIX,
            url=f'{url_base}/{ncbitaxonomy_id}.tsv',
            version=version,
            na_values={'-'},
            header=0,
            dtype=str,
        )
        species_dfs.append(species_df)
    return pd.concat(species_dfs)
Exemplo n.º 20
0
def _get_human_orthologs(version: str,
                         force: bool = False) -> Mapping[str, Set[str]]:
    """Map FlyBase gene identifiers to sets of HGNC ortholog identifiers."""
    url = (f"http://ftp.flybase.net/releases/FB{version}/precomputed_files/"
           f"orthologs/dmel_human_orthologs_disease_fb_{version}.tsv.gz")
    orthologs = ensure_df(
        PREFIX,
        url=url,
        force=force,
        version=version,
        skiprows=2,
        header=None,
        names=["flybase_id", "hgnc_id"],
        usecols=[0, 2],
    )
    return multisetdict(orthologs.values)
Exemplo n.º 21
0
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Iterate over DrugCentral terms."""
    url = f"https://unmtid-shinyapps.net/download/DrugCentral/{version}/structures.smiles.tsv"
    df = ensure_df(PREFIX, url=url, version=version, force=force)
    for row in df.values:
        smiles, inchi, inchi_key, drugcentral_id, drugcentral_name, cas = row
        # Skip structures missing any of the required chemical descriptors.
        if pd.isna(smiles) or pd.isna(inchi) or pd.isna(inchi_key):
            logger.warning("missing data for drugcentral:%s", drugcentral_id)
            continue
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=drugcentral_id,
            name=drugcentral_name,
        )
        term.append_xref(Reference(prefix="inchikey", identifier=inchi_key))
        term.append_property("smiles", smiles)
        term.append_property("inchi", inchi)
        if pd.notna(cas):
            term.append_xref(Reference(prefix="cas", identifier=cas))
        yield term
Exemplo n.º 22
0
def get_mature_df(version: str) -> pd.DataFrame:
    """Get mature miRNA dataframe."""
    url = f"ftp://mirbase.org/pub/mirbase/{version}/database_files/mirna_mature.txt.gz"
    columns = ["mature_key", "name", "previous", "mirbase.mature_id"]
    return ensure_df(
        PREFIX,
        url=url,
        version=version,
        names=columns,
        usecols=[0, 1, 2, 3],
        index_col=0,
        dtype=str,
    )
Exemplo n.º 23
0
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over selventa family terms."""
    df = ensure_df(PREFIX, url=URL, skiprows=9, force=force)

    # First pass: build every term so parents can be wired up afterwards.
    id_to_term = {}
    for identifier, label, synonyms in df[["ID", "LABEL", "SYNONYMS"]].values:
        term = Term.from_triple(PREFIX, identifier, label)
        if pd.notna(synonyms):
            for synonym in synonyms.split("|"):
                term.append_synonym(synonym)
        id_to_term[identifier] = term

    # Parents are encoded as SFAM CURIEs; strip the prefix before lookup.
    df.PARENTS = df.PARENTS.map(lambda x: x[len("SFAM:"):], na_action="ignore")
    for child, parent in df.loc[df.PARENTS.notna(), ["ID", "PARENTS"]].values:
        if child == parent:
            # A few rows list a term as its own parent; skip those.
            continue
        id_to_term[child].append_parent(id_to_term[parent])

    yield from id_to_term.values()
Exemplo n.º 24
0
def get_chembl_protein_equivalences(
        version: Optional[str] = None) -> pd.DataFrame:
    """Get ChEMBL protein equivalences."""
    # Default to the latest ChEMBL release when no version is given.
    if version is None:
        version = bioversions.get_version('chembl')

    url = (
        f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/'
        f'chembl_{version}/chembl_uniprot_mapping.txt'
    )
    df = ensure_df(
        CHEMBL_TARGET_PREFIX,
        url=url,
        sep='\t',
        usecols=[0, 1],
        names=[TARGET_ID, SOURCE_ID],  # switch around
    )
    # Annotate constant prefix/provenance columns, then select xref layout.
    df.loc[:, SOURCE_PREFIX] = 'chembl.target'
    df.loc[:, TARGET_PREFIX] = 'uniprot'
    df.loc[:, PROVENANCE] = f'chembl{version}'
    return df[XREF_COLUMNS]
Exemplo n.º 25
0
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over DrugCentral terms."""
    df = ensure_df(PREFIX, url=URL, version=version)
    for smiles, inchi, inchi_key, drugcentral_id, drugcentral_name, cas in df.values:
        # Skip structures missing any of the required chemical descriptors.
        if pd.isna(smiles) or pd.isna(inchi) or pd.isna(inchi_key):
            logger.warning("missing data for drugcentral:%s", drugcentral_id)
            continue
        reference = Reference(prefix=PREFIX, identifier=drugcentral_id, name=drugcentral_name)
        xrefs = [
            Reference(prefix="smiles", identifier=smiles),
            Reference(prefix="inchi", identifier=inchi),
            Reference(prefix="inchikey", identifier=inchi_key),
        ]
        if pd.notna(cas):
            xrefs.append(Reference(prefix="cas", identifier=cas))

        yield Term(reference=reference, xrefs=xrefs)
Exemplo n.º 26
0
def get_terms(force: bool = False) -> Iterable[Term]:
    """Get CGNC terms."""
    df = ensure_df(
        PREFIX,
        url=URL,
        name=f"{PREFIX}.tsv",
        force=force,
        header=0,
        names=HEADER,
    )
    for row_index, row in enumerate(df.values):
        cgnc_id, entrez_id, ensembl_id, name, synonym_1, synonym_2, _, _ = row
        # Rows without a usable, integer-like CGNC identifier are skipped.
        if pd.isna(cgnc_id):
            logger.warning(f"row {row_index} CGNC ID is none")
            continue
        try:
            int(cgnc_id)
        except ValueError:
            logger.warning(f"row {row_index} CGNC ID is not int-like: {cgnc_id}")
            continue

        term = Term.from_triple(
            prefix=PREFIX,
            identifier=cgnc_id,
            name=name,
        )
        term.set_species(identifier="9031", name="Gallus gallus")
        if entrez_id and pd.notna(entrez_id):
            term.append_xref(Reference(prefix="ncbigene", identifier=entrez_id))
        if pd.notna(ensembl_id):
            term.append_xref(Reference(prefix="ensembl", identifier=ensembl_id))
        if synonym_1 and pd.notna(synonym_1):
            term.append_synonym(synonym_1)
        if synonym_2 and pd.notna(synonym_2):
            term.append_synonym(synonym_2)
        yield term
Exemplo n.º 27
0
def get_terms(force: bool = False,
              version: Optional[str] = None) -> Iterable[Term]:
    """Get ZFIN terms.

    First yields one term per Sequence Ontology type appearing in the
    marker file, then one term per marker with its SO type as parent,
    alt identifiers, an NCBI gene xref, UniProt gene products, and
    human/mouse/fly orthologs.
    """
    # Two columns: alt (secondary) identifier, primary ZFIN identifier.
    alt_ids_df = ensure_df(
        PREFIX,
        url=ALTS_URL,
        name="alts.tsv",
        force=force,
        header=None,
        names=["alt", "zfin_id"],
        version=version,
    )
    primary_to_alt_ids = defaultdict(set)
    for alt_id, zfin_id in alt_ids_df.values:
        primary_to_alt_ids[zfin_id].add(alt_id)

    # Each mapping pairs the ZFIN identifier (column 0) with one
    # cross-reference column from the corresponding file.
    human_orthologs = multisetdict(
        ensure_df(PREFIX,
                  url=HUMAN_ORTHOLOGS,
                  force=force,
                  header=None,
                  usecols=[0, 7],
                  version=version).values)
    mouse_orthologs = multisetdict(
        ensure_df(PREFIX,
                  url=MOUSE_ORTHOLOGS,
                  force=force,
                  header=None,
                  usecols=[0, 5],
                  version=version).values)
    fly_orthologs = multisetdict(
        ensure_df(PREFIX,
                  url=FLY_ORTHOLOGS,
                  force=force,
                  header=None,
                  usecols=[0, 5],
                  version=version).values)
    entrez_mappings = dict(
        ensure_df(PREFIX,
                  url=ENTREZ_MAPPINGS,
                  force=force,
                  header=None,
                  usecols=[0, 3],
                  version=version).values)
    uniprot_mappings = multidict(
        ensure_df(PREFIX,
                  url=UNIPROT_MAPPINGS,
                  force=force,
                  header=None,
                  usecols=[0, 3],
                  version=version).values)

    df = ensure_df(
        PREFIX,
        url=URL,
        name="markers.tsv",
        force=force,
        header=None,
        names=MARKERS_COLUMNS,
        version=version,
    )
    # SO identifiers are given as CURIEs ("SO:..."); keep the local part.
    df["sequence_ontology_id"] = df["sequence_ontology_id"].map(
        lambda x: x[len("SO:"):])
    so = {
        sequence_ontology_id: Reference.auto(prefix="SO",
                                             identifier=sequence_ontology_id)
        for sequence_ontology_id in df["sequence_ontology_id"].unique()
    }
    # Yield the SO parent terms before the markers that reference them.
    for _, reference in sorted(so.items()):
        yield Term(reference=reference)
    for identifier, name, definition, _entity_type, sequence_ontology_id in tqdm(
            df.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=name,
            # Drop definitions that merely repeat the name.
            definition=definition if definition != name else None,
        )
        term.set_species(identifier="7955", name="Danio rerio")
        term.append_parent(so[sequence_ontology_id])
        # Entity type is redundant of identifier
        # term.append_property("type", entity_type)
        for alt_id in primary_to_alt_ids[identifier]:
            term.append_alt(alt_id)
        entrez_id = entrez_mappings.get(identifier)
        if entrez_id:
            term.append_xref(Reference("ncbigene", entrez_id))
        for uniprot_id in uniprot_mappings.get(identifier, []):
            term.append_relationship(has_gene_product,
                                     Reference.auto("uniprot", uniprot_id))
        for hgnc_id in human_orthologs.get(identifier, []):
            term.append_relationship(orthologous,
                                     Reference.auto("hgnc", hgnc_id))
        for mgi_curie in mouse_orthologs.get(identifier, []):
            # Mouse orthologs are full CURIEs; parse, and skip unparseable ones.
            mouse_ortholog = Reference.from_curie(mgi_curie, auto=True)
            if mouse_ortholog:
                term.append_relationship(orthologous, mouse_ortholog)
        for flybase_id in fly_orthologs.get(identifier, []):
            term.append_relationship(orthologous,
                                     Reference("flybase", flybase_id))

        yield term
Exemplo n.º 28
0
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get the FamPlex terms.

    :param version: The famplex release (git tag) to download.
    :param force: Should cached files be re-downloaded?
    :yields: One term per FamPlex entity, carrying its definition and
        provenance, xrefs, and isa/partof relations to/from HGNC genes
        and other families.
    """
    base_url = f"https://raw.githubusercontent.com/sorgerlab/famplex/{version}"

    entities_url = f"{base_url}/entities.csv"
    entities_df = ensure_df(PREFIX,
                            url=entities_url,
                            version=version,
                            dtype=str,
                            force=force)

    relations_url = f"{base_url}/relations.csv"
    relations_df = ensure_df(PREFIX,
                             url=relations_url,
                             version=version,
                             header=None,
                             sep=",",
                             dtype=str,
                             force=force)

    definitions_url = f"{base_url}/descriptions.csv"
    definitions_df = ensure_df(
        PREFIX,
        url=definitions_url,
        version=version,
        header=None,
        sep=",",
        dtype=str,
        force=force,
    )
    id_to_definition = {
        identifier: (definition, provenance)
        for identifier, provenance, definition in definitions_df.values
    }

    id_xrefs = _get_xref_df(version)

    hgnc_name_to_id = get_name_id_mapping("hgnc")
    in_edges = defaultdict(list)
    out_edges = defaultdict(list)
    # Relations rows are (head namespace, head name, relation, tail
    # namespace, tail name). UniProt-anchored rows are skipped.
    for h_ns, h_name, r, t_ns, t_name in relations_df.values:
        if h_ns == "HGNC":
            h_identifier = hgnc_name_to_id.get(h_name)
            if h_identifier is None:
                logger.warning(
                    "[%s] could not look up HGNC identifier for gene: %s",
                    PREFIX, h_name)
            h = Reference(prefix="hgnc", identifier=h_identifier, name=h_name)
        elif h_ns == "FPLX":
            h = Reference(prefix="fplx", identifier=h_name, name=h_name)
        elif h_ns == "UP":
            continue
        else:
            # BUG FIX: was ``logger.exception`` (outside a handler) plus a
            # bare ``raise`` with no active exception (-> RuntimeError).
            raise ValueError(f"unhandled head namespace: {h_ns}")
        if t_ns == "HGNC":
            t_identifier = hgnc_name_to_id.get(t_name)
            if t_identifier is None:
                logger.warning(
                    "[%s] could not look up HGNC identifier for gene: %s",
                    PREFIX, t_name)
            t = Reference(prefix="hgnc", identifier=t_identifier, name=t_name)
        elif t_ns == "FPLX":
            t = Reference(prefix="fplx", identifier=t_name, name=t_name)
        elif t_ns == "UP":
            # BUG FIX: this branch previously re-tested ``h_ns == "UP"``
            # (copy-paste error), so rows with a UniProt *tail* fell through
            # to the bare ``raise`` below instead of being skipped.
            continue
        else:
            raise ValueError(f"unhandled tail namespace: {t_ns}")

        out_edges[h].append((r, t))
        in_edges[t].append((r, h))

    for (entity, ) in entities_df.values:
        reference = Reference(prefix=PREFIX, identifier=entity, name=entity)
        definition, provenance = id_to_definition.get(entity, (None, None))
        term = Term(
            reference=reference,
            definition=definition,
            provenance=[Reference.from_curie(provenance)]
            if definition is not None else None,
        )

        for xref_reference in id_xrefs.get(entity, []):
            term.append_xref(xref_reference)

        # Outgoing edges: isa to another family becomes a parent;
        # everything else becomes an explicit relationship.
        for r, t in out_edges.get(reference, []):
            if r == "isa" and t.prefix == "fplx":
                term.append_parent(t)
            elif r == "isa":
                term.append_relationship(is_a, t)
            elif r == "partof":
                term.append_relationship(part_of, t)
            else:
                logging.warning("unhandled relation %s", r)

        # Incoming edges are inverted to has_member / has_part.
        for r, h in in_edges.get(reference, []):
            if r == "isa":
                term.append_relationship(has_member, h)
            elif r == "partof":
                term.append_relationship(has_part, h)
            else:
                logging.warning("unhandled relation %s", r)
        yield term
Exemplo n.º 29
0
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get the FamPlex terms.

    :param version: The famplex release (git tag) to download.
    :param force: Should cached files be re-downloaded?
    :yields: One term per FamPlex entity, carrying its definition and
        provenance plus isa/partof relations to/from HGNC genes and
        other families.
    """
    base_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}'

    entities_url = f'{base_url}/entities.csv'
    entities_df = ensure_df(PREFIX, url=entities_url, version=version, dtype=str, force=force)

    relations_url = f'{base_url}/relations.csv'
    relations_df = ensure_df(PREFIX, url=relations_url, version=version, header=None, sep=',', dtype=str, force=force)

    definitions_url = f'{base_url}/descriptions.csv'
    definitions_df = ensure_df(
        PREFIX, url=definitions_url, version=version, header=None, sep=',', dtype=str, force=force,
    )
    id_to_definition = {
        identifier: (definition, provenance)
        for identifier, provenance, definition in definitions_df.values
    }

    # TODO add xrefs
    # xrefs_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}/equivalences.csv'
    # xrefs_df = ensure_df(PREFIX, url=xrefs_url, version=version, header=None, sep=',', dtype=str)

    hgnc_name_to_id = get_name_id_mapping('hgnc')
    in_edges = defaultdict(list)
    out_edges = defaultdict(list)
    # Relations rows are (head namespace, head name, relation, tail
    # namespace, tail name). UniProt-anchored rows are skipped.
    for h_ns, h_name, r, t_ns, t_name in relations_df.values:
        if h_ns == 'HGNC':
            h_identifier = hgnc_name_to_id.get(h_name)
            if h_identifier is None:
                logger.warning('[%s] could not look up HGNC identifier for gene: %s', PREFIX, h_name)
            h = Reference(prefix='hgnc', identifier=h_identifier, name=h_name)
        elif h_ns == 'FPLX':
            h = Reference(prefix='fplx', identifier=h_name, name=h_name)
        elif h_ns == 'UP':
            continue
        else:
            # BUG FIX: was ``logger.exception`` (outside a handler) plus a
            # bare ``raise`` with no active exception (-> RuntimeError).
            raise ValueError(f'unhandled head namespace: {h_ns}')
        if t_ns == 'HGNC':
            t_identifier = hgnc_name_to_id.get(t_name)
            if t_identifier is None:
                logger.warning('[%s] could not look up HGNC identifier for gene: %s', PREFIX, t_name)
            t = Reference(prefix='hgnc', identifier=t_identifier, name=t_name)
        elif t_ns == 'FPLX':
            t = Reference(prefix='fplx', identifier=t_name, name=t_name)
        elif t_ns == 'UP':
            # BUG FIX: this branch previously re-tested ``h_ns == 'UP'``
            # (copy-paste error), so rows with a UniProt *tail* fell through
            # to the bare ``raise`` below instead of being skipped.
            continue
        else:
            raise ValueError(f'unhandled tail namespace: {t_ns}')

        out_edges[h].append((r, t))
        in_edges[t].append((r, h))

    for entity, in entities_df.values:
        reference = Reference(prefix=PREFIX, identifier=entity, name=entity)
        definition, provenance = id_to_definition.get(entity, (None, None))
        term = Term(
            reference=reference,
            definition=definition,
            provenance=[Reference.from_curie(provenance)] if definition is not None else None,
        )

        # Outgoing edges: isa to another family becomes a parent;
        # everything else becomes an explicit relationship.
        for r, t in out_edges.get(reference, []):
            if r == 'isa' and t.prefix == 'fplx':
                term.append_parent(t)
            elif r == 'isa':
                term.append_relationship(is_a, t)
            elif r == 'partof':
                term.append_relationship(part_of, t)
            else:
                logging.warning('unhandled relation %s', r)

        # Incoming edges are inverted to has_member / has_part.
        for r, h in in_edges.get(reference, []):
            if r == 'isa':
                term.append_relationship(has_member, h)
            elif r == 'partof':
                term.append_relationship(has_part, h)
            else:
                logging.warning('unhandled relation %s', r)
        yield term
Exemplo n.º 30
0
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over terms in Rhea.

    Builds terms from the directions file (master reaction plus its
    left-to-right, right-to-left, and bidirectional forms), layers on the
    is_a hierarchy and external xrefs, then fills in names from the Rhea
    RDF dump before yielding everything.
    """
    terms = {}

    # Columns: master reaction ID, then the LR, RL, and bidirectional IDs.
    directions = ensure_df(
        PREFIX,
        url='ftp://ftp.expasy.org/databases/rhea/tsv/rhea-directions.tsv',
        version=version)
    for master, lr, rl, bi in directions.values:
        terms[master] = Term(reference=Reference(PREFIX, master))
        terms[lr] = Term(reference=Reference(PREFIX, lr))
        terms[rl] = Term(reference=Reference(PREFIX, rl))
        terms[bi] = Term(reference=Reference(PREFIX, bi))

        # The three directed forms hang off the master reaction.
        terms[master].append_relationship(has_lr, terms[lr])
        terms[master].append_relationship(has_rl, terms[rl])
        terms[master].append_relationship(has_bi, terms[bi])
        terms[lr].append_parent(terms[master])
        terms[rl].append_parent(terms[master])
        terms[bi].append_parent(terms[master])

    hierarchy = ensure_df(
        PREFIX,
        url='ftp://ftp.expasy.org/databases/rhea/tsv/rhea-relationships.tsv',
        version=version)
    for source, relation, target in hierarchy.values:
        # Only is_a is expected in this file; anything else is a data error.
        if relation != 'is_a':
            raise ValueError(f'RHEA unrecognized relation: {relation}')
        terms[source].append_parent(terms[target])

    # Each rhea2* file maps Rhea reactions to one external database.
    for xref_prefix, url in [
        ('ecocyc', 'rhea2ecocyc'),
        ('kegg.reaction', 'rhea2kegg_reaction'),
        ('reactome', 'rhea2reactome'),
        ('macie', 'rhea2macie'),
        ('metacyc', 'rhea2metacyc'),
    ]:
        xref_df = ensure_df(
            PREFIX,
            url=f'ftp://ftp.expasy.org/databases/rhea/tsv/{url}.tsv',
            version=version)
        for rhea_id, _, _, xref_id in xref_df.values:
            if rhea_id not in terms:
                logger.warning('[%s] could not find %s:%s for xref %s:%s',
                               PREFIX, PREFIX, rhea_id, xref_prefix, xref_id)
                continue
            terms[rhea_id].append_xref(Reference(xref_prefix, xref_id))

    # TODO are EC codes equivalent?
    # TODO uniprot enabled by (RO:0002333)
    # TODO names?

    # Labels only live in the RDF dump; query it to name the terms built above.
    url = 'ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz'
    graph = pystow.ensure_rdf('pyobo', 'raw', PREFIX, version, url=url)
    result = graph.query('''
    PREFIX rh:<http://rdf.rhea-db.org/>
    SELECT ?reaction ?reactionId ?reactionLabel WHERE {
      ?reaction rdfs:subClassOf rh:Reaction .
      ?reaction rh:id ?reactionId .
      ?reaction rdfs:label ?reactionLabel .
    }
    ''')
    for _, identifier, name in result:
        identifier = str(identifier)
        if identifier not in terms:
            logger.warning('isolated element in rdf: rhea:%s ! %s', identifier,
                           name)
            continue
        terms[identifier].reference.name = name

    # TODO participants?

    yield from terms.values()