示例#1
0
def _parse(i, lines: Iterable[Tuple[str, str]]) -> Optional[Term]:
    dd_: DefaultDict[str, List[str]] = defaultdict(list)
    for key, value in lines:
        dd_[key].append(value)
    dd: Mapping[str, List[str]] = dict(dd_)

    if "//" in dd:
        return None

    accessions = dd["AC"]
    labels = dd.get("ID")
    reference = Reference(
        prefix="uniprot.ptm",
        identifier=accessions[0],
        name=labels[0] if labels else None,
    )
    term = Term(reference=reference)
    for line in dd.get("DR", []):
        line = line.rstrip(".")
        for x, y in [
            ("MOD; ", "PSI-MOD; MOD:"),
            ("CHEBI; ", "ChEBI; CHEBI:"),
        ]:
            if line.startswith(y):
                line = x + line[len(y):]

        ref = Reference.from_curie(line.replace("; ", ":"))
        if ref:
            term.append_xref(ref)
        else:
            tqdm.write(f"Failure on xref {line}")
    return term
示例#2
0
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Yield Sequence Ontology gene-type terms, then one term per names-table row.

    :param version: Forwarded to each of the data-loading helpers.
    :param force: Forwarded to each of the data-loading helpers.
    :yields: First a bare term for every SO reference that a gene type found
        in the names table maps to, ordered by gene type. After that, one term
        per row of the names table, with a parent, synonym, ortholog
        relationships and a species relationship attached where available.
    """
    definitions = _get_definitions(version=version, force=force)
    abbr_to_taxonomy = _get_organisms(version=version, force=force)
    names_df = _get_names(version=version, force=force)
    human_orthologs = _get_human_orthologs(version=version, force=force)

    # Resolve every distinct value of the second column (unpacked as the gene
    # type in the row loop below) to an SO reference once, up front.
    gtype_column = names_df.columns[1]
    so = {}
    for gene_type in names_df[gtype_column].unique():
        so_id = GTYPE_TO_SO.get(gene_type)
        if so_id is not None:
            so[gene_type] = Reference.auto("SO", so_id)
            continue
        logger.warning(
            "FlyBase gene type is missing mapping to Sequence Ontology (SO): %s",
            gene_type,
        )

    for _gene_type, so_reference in sorted(so.items()):
        yield Term(reference=so_reference)

    # Species abbreviations already reported, so each is only written once.
    missing_taxonomies = set()
    for organism, gtype, identifier, symbol, name in tqdm(names_df.values):
        if pd.notna(symbol):
            label = symbol
        else:
            label = None
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=label,
            definition=definitions.get(identifier),
        )

        if gtype and pd.notna(gtype) and gtype in so:
            term.append_parent(so[gtype])
        if pd.notna(name):
            term.append_synonym(name)

        for hgnc_curie in human_orthologs.get(identifier, []):
            if hgnc_curie and not pd.isna(hgnc_curie):
                hgnc_ortholog = Reference.from_curie(hgnc_curie, auto=True)
                if hgnc_ortholog is not None:
                    term.append_relationship(orthologous, hgnc_ortholog)
                else:
                    tqdm.write(
                        f"fb:{identifier} had invalid ortholog: {hgnc_curie}")

        taxonomy_id = abbr_to_taxonomy.get(organism)
        if taxonomy_id is None:
            if organism not in missing_taxonomies:
                tqdm.write(f"missing mapping for species abbreviation: {organism}")
                missing_taxonomies.add(organism)
        else:
            species = Reference(NCBITAXON_PREFIX, taxonomy_id)
            term.append_relationship(from_species, species)
        yield term

    if not missing_taxonomies:
        return
    tqdm.write(
        f"there were {len(missing_taxonomies)} missing taxa in flybase genes"
    )
示例#3
0
def _extract_references(s: str) -> Iterable[Reference]:
    """Yield a reference for each comma-separated piece of ``s`` that parses.

    :param s: Text that is split on ``,``; each piece is handed to
        ``Reference.from_curie`` unchanged.
    :yields: The results of ``Reference.from_curie`` that are not ``None``,
        in the order of the pieces.
    """
    # ``map`` is lazy, so each piece is only parsed when the consumer asks
    # for the next reference.
    parsed = map(Reference.from_curie, s.split(','))
    yield from (candidate for candidate in parsed if candidate is not None)