示例#1
0
def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
    """Parse a pipe-delimited xref string into ``(Reference, note)`` pairs.

    Each entry is expected to look like ``prefix:identifier(note)``.
    Malformed entries are logged and skipped rather than aborting the parse.
    A missing value (NaN from pandas) yields an empty list.
    """
    if pd.isna(s):
        return []

    rv = []
    for xref in s.split("|"):
        # Normalize the non-standard Protein Ontology prefixes to plain "PR:".
        xref = xref.replace("protein ontology:PR:", "PR:")
        xref = xref.replace("protein ontology:PR_", "PR:")
        # partition() splits only on the FIRST "(", so a note that itself
        # contains parentheses is preserved instead of being discarded
        # (str.split("(") raised ValueError on such entries).
        xref_curie, sep, note = xref.partition("(")
        if not sep:
            logger.warning("xref missing (: %s", xref)
            continue
        note = note.rstrip(")")
        try:
            reference = Reference.from_curie(xref_curie)
        except ValueError:
            logger.warning("can not parse CURIE: %s", xref)
            continue
        if reference is None:
            logger.warning("reference is None after parsing: %s", xref)
            continue
        rv.append((reference, note))
    return rv
示例#2
0
def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
    """Split a pipe-delimited xref field into ``(Reference, note)`` pairs.

    Entries that cannot be split on ``(`` or whose CURIE cannot be parsed
    are logged and dropped; a missing (NaN) value yields an empty list.
    """
    if pd.isna(s):
        return []

    pairs = []
    for raw in s.split('|'):
        # Canonicalize the legacy Protein Ontology prefixes.
        for legacy_prefix in ('protein ontology:PR:', 'protein ontology:PR_'):
            raw = raw.replace(legacy_prefix, 'PR:')
        try:
            curie, annotation = raw.split('(')
        except ValueError:
            logger.warning('xref missing (: %s', raw)
            continue
        annotation = annotation.rstrip(')')
        try:
            parsed = Reference.from_curie(curie)
        except ValueError:
            logger.warning('can not parse CURIE: %s', raw)
            continue
        if parsed is None:
            logger.warning('reference is None after parsing: %s', raw)
            continue
        pairs.append((parsed, annotation))
    return pairs
示例#3
0
def _famplex_node(ns, name, hgnc_name_to_id):
    """Resolve one endpoint of a FamPlex relation row to a Reference.

    Returns None for UniProt ('UP') rows, which are skipped on purpose.
    Raises ValueError on an unknown namespace (the original code used a
    bare ``raise`` outside an except block, which raises RuntimeError).
    """
    if ns == 'HGNC':
        identifier = hgnc_name_to_id.get(name)
        if identifier is None:
            logger.warning('[%s] could not look up HGNC identifier for gene: %s', PREFIX, name)
        return Reference(prefix='hgnc', identifier=identifier, name=name)
    if ns == 'FPLX':
        return Reference(prefix='fplx', identifier=name, name=name)
    if ns == 'UP':
        return None
    raise ValueError(f'unhandled namespace: {ns}')


def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get the FamPlex terms.

    :param version: The FamPlex release (git ref) whose files are downloaded.
    :param force: If true, re-download the source files.
    :yields: One term per FamPlex entity, carrying its definition/provenance
        and its is-a/part-of relations in both directions.
    """
    base_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}'

    entities_url = f'{base_url}/entities.csv'
    entities_df = ensure_df(PREFIX, url=entities_url, version=version, dtype=str, force=force)

    relations_url = f'{base_url}/relations.csv'
    relations_df = ensure_df(PREFIX, url=relations_url, version=version, header=None, sep=',', dtype=str, force=force)

    definitions_url = f'{base_url}/descriptions.csv'
    definitions_df = ensure_df(
        PREFIX, url=definitions_url, version=version, header=None, sep=',', dtype=str, force=force,
    )
    id_to_definition = {
        identifier: (definition, provenance)
        for identifier, provenance, definition in definitions_df.values
    }

    # TODO add xrefs
    # xrefs_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}/equivalences.csv'
    # xrefs_df = ensure_df(PREFIX, url=xrefs_url, version=version, header=None, sep=',', dtype=str)

    hgnc_name_to_id = get_name_id_mapping('hgnc')
    in_edges = defaultdict(list)
    out_edges = defaultdict(list)
    for h_ns, h_name, r, t_ns, t_name in relations_df.values:
        h = _famplex_node(h_ns, h_name, hgnc_name_to_id)
        if h is None:  # UniProt head: skip the row
            continue
        # Fixed: the original re-checked h_ns == 'UP' here instead of t_ns,
        # so relations with a UniProt *target* hit the bare ``raise``.
        t = _famplex_node(t_ns, t_name, hgnc_name_to_id)
        if t is None:  # UniProt target: skip the row
            continue

        out_edges[h].append((r, t))
        in_edges[t].append((r, h))

    for entity, in entities_df.values:
        reference = Reference(prefix=PREFIX, identifier=entity, name=entity)
        definition, provenance = id_to_definition.get(entity, (None, None))
        term = Term(
            reference=reference,
            definition=definition,
            # definition and provenance are stored pairwise, so a non-None
            # definition implies a provenance CURIE is present.
            provenance=[Reference.from_curie(provenance)] if definition is not None else None,
        )

        for r, t in out_edges.get(reference, []):
            if r == 'isa' and t.prefix == 'fplx':
                term.append_parent(t)
            elif r == 'isa':
                term.append_relationship(is_a, t)
            elif r == 'partof':
                term.append_relationship(part_of, t)
            else:
                # Fixed: was logging.warning (root logger) instead of the
                # module-level logger used everywhere else.
                logger.warning('unhandled relation %s', r)

        for r, h in in_edges.get(reference, []):
            if r == 'isa':
                term.append_relationship(has_member, h)
            elif r == 'partof':
                term.append_relationship(has_part, h)
            else:
                logger.warning('unhandled relation %s', r)
        yield term
示例#4
0
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get the FamPlex terms.

    :param version: The FamPlex release (git ref) whose files are downloaded.
    :param force: If true, re-download the source files.
    :yields: One term per FamPlex entity, carrying its definition/provenance,
        its equivalence xrefs, and its is-a/part-of relations.
    """
    base_url = f"https://raw.githubusercontent.com/sorgerlab/famplex/{version}"

    entities_url = f"{base_url}/entities.csv"
    entities_df = ensure_df(PREFIX,
                            url=entities_url,
                            version=version,
                            dtype=str,
                            force=force)

    relations_url = f"{base_url}/relations.csv"
    relations_df = ensure_df(PREFIX,
                             url=relations_url,
                             version=version,
                             header=None,
                             sep=",",
                             dtype=str,
                             force=force)

    definitions_url = f"{base_url}/descriptions.csv"
    definitions_df = ensure_df(
        PREFIX,
        url=definitions_url,
        version=version,
        header=None,
        sep=",",
        dtype=str,
        force=force,
    )
    id_to_definition = {
        identifier: (definition, provenance)
        for identifier, provenance, definition in definitions_df.values
    }

    id_xrefs = _get_xref_df(version)

    hgnc_name_to_id = get_name_id_mapping("hgnc")
    in_edges = defaultdict(list)
    out_edges = defaultdict(list)
    for h_ns, h_name, r, t_ns, t_name in relations_df.values:
        if h_ns == "HGNC":
            h_identifier = hgnc_name_to_id.get(h_name)
            if h_identifier is None:
                logger.warning(
                    "[%s] could not look up HGNC identifier for gene: %s",
                    PREFIX, h_name)
            h = Reference(prefix="hgnc", identifier=h_identifier, name=h_name)
        elif h_ns == "FPLX":
            h = Reference(prefix="fplx", identifier=h_name, name=h_name)
        elif h_ns == "UP":
            continue
        else:
            # Fixed: the original called logger.exception outside an except
            # block and then did a bare ``raise`` (-> RuntimeError).
            raise ValueError(f"unhandled head namespace: {h_ns}")
        if t_ns == "HGNC":
            t_identifier = hgnc_name_to_id.get(t_name)
            if t_identifier is None:
                logger.warning(
                    "[%s] could not look up HGNC identifier for gene: %s",
                    PREFIX, t_name)
            t = Reference(prefix="hgnc", identifier=t_identifier, name=t_name)
        elif t_ns == "FPLX":
            t = Reference(prefix="fplx", identifier=t_name, name=t_name)
        elif t_ns == "UP":
            # Fixed: the original mistakenly re-checked h_ns == "UP" here,
            # so rows with a UniProt *target* fell through to the raise.
            continue
        else:
            raise ValueError(f"unhandled target namespace: {t_ns}")

        out_edges[h].append((r, t))
        in_edges[t].append((r, h))

    for (entity, ) in entities_df.values:
        reference = Reference(prefix=PREFIX, identifier=entity, name=entity)
        definition, provenance = id_to_definition.get(entity, (None, None))
        term = Term(
            reference=reference,
            definition=definition,
            # definition and provenance are stored pairwise, so a non-None
            # definition implies a provenance CURIE is present.
            provenance=[Reference.from_curie(provenance)]
            if definition is not None else None,
        )

        for xref_reference in id_xrefs.get(entity, []):
            term.append_xref(xref_reference)

        for r, t in out_edges.get(reference, []):
            if r == "isa" and t.prefix == "fplx":
                term.append_parent(t)
            elif r == "isa":
                term.append_relationship(is_a, t)
            elif r == "partof":
                term.append_relationship(part_of, t)
            else:
                # Fixed: was logging.warning (root logger) instead of the
                # module-level logger used everywhere else.
                logger.warning("unhandled relation %s", r)

        for r, h in in_edges.get(reference, []):
            if r == "isa":
                term.append_relationship(has_member, h)
            elif r == "partof":
                term.append_relationship(has_part, h)
            else:
                logger.warning("unhandled relation %s", r)
        yield term
示例#5
0
def get_terms(force: bool = False,
              version: Optional[str] = None) -> Iterable[Term]:
    """Get terms.

    Yields the Sequence Ontology parent terms first (sorted), then one term
    per ZFIN marker with its alt ids, xrefs, and ortholog relationships.
    """
    # Alternate identifiers, grouped by the primary ZFIN identifier.
    alts_df = ensure_df(
        PREFIX,
        url=ALTS_URL,
        name="alts.tsv",
        force=force,
        header=None,
        names=["alt", "zfin_id"],
        version=version,
    )
    alt_ids_by_primary = defaultdict(set)
    for alt_identifier, primary_identifier in alts_df.values:
        alt_ids_by_primary[primary_identifier].add(alt_identifier)

    def _fetch(url, columns):
        # All ortholog/mapping files share the same download settings and
        # differ only in their URL and the two columns that are kept.
        return ensure_df(
            PREFIX,
            url=url,
            force=force,
            header=None,
            usecols=columns,
            version=version,
        ).values

    human_orthologs = multisetdict(_fetch(HUMAN_ORTHOLOGS, [0, 7]))
    mouse_orthologs = multisetdict(_fetch(MOUSE_ORTHOLOGS, [0, 5]))
    fly_orthologs = multisetdict(_fetch(FLY_ORTHOLOGS, [0, 5]))
    entrez_mappings = dict(_fetch(ENTREZ_MAPPINGS, [0, 3]))
    uniprot_mappings = multidict(_fetch(UNIPROT_MAPPINGS, [0, 3]))

    markers_df = ensure_df(
        PREFIX,
        url=URL,
        name="markers.tsv",
        force=force,
        header=None,
        names=MARKERS_COLUMNS,
        version=version,
    )
    # Strip the "SO:" prefix so the bare identifier can be used below.
    markers_df["sequence_ontology_id"] = markers_df["sequence_ontology_id"].map(
        lambda curie: curie[len("SO:"):])
    so_references = {
        so_identifier: Reference.auto(prefix="SO", identifier=so_identifier)
        for so_identifier in markers_df["sequence_ontology_id"].unique()
    }
    for _, so_reference in sorted(so_references.items()):
        yield Term(reference=so_reference)

    for identifier, name, definition, _entity_type, so_identifier in tqdm(
            markers_df.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=name,
            # A definition that merely repeats the name adds no information.
            definition=definition if definition != name else None,
        )
        term.set_species(identifier="7955", name="Danio rerio")
        term.append_parent(so_references[so_identifier])
        # Entity type is redundant of identifier
        for alt_identifier in alt_ids_by_primary[identifier]:
            term.append_alt(alt_identifier)
        entrez_id = entrez_mappings.get(identifier)
        # NOTE(review): this truthiness check would not filter a float NaN
        # coming from pandas — confirm the Entrez column is always a string.
        if entrez_id:
            term.append_xref(Reference("ncbigene", entrez_id))
        for uniprot_id in uniprot_mappings.get(identifier, []):
            term.append_relationship(
                has_gene_product, Reference.auto("uniprot", uniprot_id))
        for hgnc_id in human_orthologs.get(identifier, []):
            term.append_relationship(
                orthologous, Reference.auto("hgnc", hgnc_id))
        for mgi_curie in mouse_orthologs.get(identifier, []):
            mouse_ortholog = Reference.from_curie(mgi_curie, auto=True)
            if mouse_ortholog:
                term.append_relationship(orthologous, mouse_ortholog)
        for flybase_id in fly_orthologs.get(identifier, []):
            term.append_relationship(
                orthologous, Reference("flybase", flybase_id))

        yield term