示例#1
0
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Yield the Sequence Ontology gene-type terms, then one term per gene row.

    Two tables are loaded through ``ensure_df``: the ortholog table at
    ``ORTHOLOGS_URL`` and the gene table at ``URL``. HGNC symbols found in the
    ortholog table are resolved to HGNC identifiers with
    ``pyobo.get_name_id_mapping("hgnc")``; symbols without a mapping are
    silently dropped.

    :param version: Data version forwarded to ``ensure_df``.
    :param force: Forwarded to ``ensure_df`` (presumably forces a fresh
        download -- confirm against ``ensure_df``).
    :yields: First a bare ``Term`` for each SO reference, ordered by gene
        type, then one ``Term`` per row of the gene table.
    :raises KeyError: If a gene type in the gene table has no entry in
        ``POMBASE_TO_SO``.
    """
    orthologs_df = ensure_df(
        PREFIX,
        url=ORTHOLOGS_URL,
        force=force,
        header=None,
        version=version,
    )
    identifier_to_hgnc_ids = defaultdict(set)
    hgnc_symbol_to_id = pyobo.get_name_id_mapping("hgnc")
    for identifier, hgnc_symbols in orthologs_df.values:
        # The literal "NONE" is a placeholder row with nothing to resolve
        # (presumably "no human ortholog" -- confirm against the data file).
        if hgnc_symbols == "NONE":
            continue
        for hgnc_symbol in hgnc_symbols.split("|"):
            hgnc_id = hgnc_symbol_to_id.get(hgnc_symbol)
            if hgnc_id is not None:
                identifier_to_hgnc_ids[identifier].add(hgnc_id)

    df = ensure_df(PREFIX, url=URL, force=force, header=None, version=version)
    # The seventh column holds the gene type; it is unpacked as ``gtype`` below.
    so = {
        gtype: Reference.auto("SO", POMBASE_TO_SO[gtype])
        for gtype in sorted(df[df.columns[6]].unique())
    }
    for _, reference in sorted(so.items()):
        yield Term(reference=reference)

    chromosome_prefix = "chromosome_"
    for identifier, _, symbol, chromosome, name, uniprot_id, gtype, synonyms in tqdm(
            df.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=symbol if pd.notna(symbol) else None,
            definition=name if pd.notna(name) else None,
        )
        # Guard against a missing value like every other column in this row,
        # and only strip the prefix when it is actually present so that values
        # without it are kept intact instead of being truncated.
        if pd.notna(chromosome):
            if chromosome.startswith(chromosome_prefix):
                chromosome = chromosome[len(chromosome_prefix):]
            term.append_property("chromosome", chromosome)
        term.append_parent(so[gtype])
        term.set_species(identifier="4896", name="Schizosaccharomyces pombe")
        # Sets have no stable iteration order; sort so that the emitted
        # relationships are reproducible from run to run.
        for hgnc_id in sorted(identifier_to_hgnc_ids.get(identifier, ())):
            term.append_relationship(orthologous,
                                     Reference.auto("hgnc", hgnc_id))
        if uniprot_id and pd.notna(uniprot_id):
            term.append_relationship(has_gene_product,
                                     Reference.auto("uniprot", uniprot_id))
        if synonyms and pd.notna(synonyms):
            for synonym in synonyms.split(","):
                term.append_synonym(Synonym(synonym))
        yield term
示例#2
0
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Yield Sequence Ontology parent terms followed by one term per gene row.

    Definitions, organism abbreviations, gene names and human orthologs are
    loaded through the module's ``_get_*`` helpers, each receiving ``version``
    and ``force`` unchanged.

    :param version: Data version forwarded to every ``_get_*`` helper.
    :param force: Forwarded to every ``_get_*`` helper.
    :yields: A bare ``Term`` for each gene type that has an SO mapping
        (ordered by gene type), then one ``Term`` per row of the names table.
    """
    definitions = _get_definitions(version=version, force=force)
    abbr_to_taxonomy = _get_organisms(version=version, force=force)
    names_df = _get_names(version=version, force=force)
    human_orthologs = _get_human_orthologs(version=version, force=force)

    # Resolve every gene type in the second column to an SO reference,
    # warning once for each type that has no mapping.
    gene_type_to_so = {}
    for gene_type in names_df[names_df.columns[1]].unique():
        so_id = GTYPE_TO_SO.get(gene_type)
        if so_id is not None:
            gene_type_to_so[gene_type] = Reference.auto("SO", so_id)
            continue
        logger.warning(
            "FlyBase gene type is missing mapping to Sequence Ontology (SO): %s",
            gene_type,
        )

    for gene_type in sorted(gene_type_to_so):
        yield Term(reference=gene_type_to_so[gene_type])

    # Species abbreviations already reported, so each is written only once.
    unmapped_organisms = set()
    for organism, gene_type, identifier, gene_symbol, full_name in tqdm(
            names_df.values):
        if pd.notna(gene_symbol):
            label = gene_symbol
        else:
            label = None
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=label,
            definition=definitions.get(identifier),
        )

        if gene_type and pd.notna(gene_type) and gene_type in gene_type_to_so:
            term.append_parent(gene_type_to_so[gene_type])

        if pd.notna(full_name):
            term.append_synonym(full_name)

        for hgnc_curie in human_orthologs.get(identifier, []):
            if hgnc_curie and not pd.isna(hgnc_curie):
                hgnc_ortholog = Reference.from_curie(hgnc_curie, auto=True)
                if hgnc_ortholog is not None:
                    term.append_relationship(orthologous, hgnc_ortholog)
                    continue
                tqdm.write(
                    f"fb:{identifier} had invalid ortholog: {hgnc_curie}")

        taxonomy_id = abbr_to_taxonomy.get(organism)
        if taxonomy_id is None:
            if organism not in unmapped_organisms:
                tqdm.write(
                    f"missing mapping for species abbreviation: {organism}")
                unmapped_organisms.add(organism)
        else:
            species = Reference(NCBITAXON_PREFIX, taxonomy_id)
            term.append_relationship(from_species, species)

        yield term

    if unmapped_organisms:
        missing_count = len(unmapped_organisms)
        tqdm.write(
            f"there were {missing_count} missing taxa in flybase genes"
        )