def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Yield the Sequence Ontology parent terms, then one term per gene row.

    Two headerless tables are loaded through ``ensure_df``: the ortholog
    table at ``ORTHOLOGS_URL`` (gene identifier, ``|``-separated HGNC
    symbols) and the gene table at ``URL``.

    :param version: Data version forwarded to ``ensure_df``.
    :param force: Forwarded to ``ensure_df`` as its ``force`` argument.
    :yields: First a bare :class:`Term` for every gene type found in the
        gene table (in sorted order), then one :class:`Term` per gene row.
    :raises KeyError: If a gene type has no entry in ``POMBASE_TO_SO``.
    """
    orthologs_df = ensure_df(
        PREFIX, url=ORTHOLOGS_URL, force=force, header=None, version=version
    )
    identifier_to_hgnc_ids = defaultdict(set)
    hgnc_symbol_to_id = pyobo.get_name_id_mapping("hgnc")
    for identifier, hgnc_symbols in orthologs_df.values:
        # "NONE" is the table's marker for "no ortholog"; a missing (NaN)
        # cell has no ``.split`` and is skipped as well.
        if pd.isna(hgnc_symbols) or hgnc_symbols == "NONE":
            continue
        for hgnc_symbol in hgnc_symbols.split("|"):
            hgnc_id = hgnc_symbol_to_id.get(hgnc_symbol)
            # Symbols unknown to the HGNC name mapping are silently dropped.
            if hgnc_id is not None:
                identifier_to_hgnc_ids[identifier].add(hgnc_id)

    df = ensure_df(PREFIX, url=URL, force=force, header=None, version=version)

    # Column 6 holds the gene type (see the row unpacking below).
    gtypes = sorted(df[df.columns[6]].unique())
    so = {gtype: Reference.auto("SO", POMBASE_TO_SO[gtype]) for gtype in gtypes}
    for gtype in gtypes:
        yield Term(reference=so[gtype])

    chromosome_prefix = "chromosome_"
    for identifier, _, symbol, chromosome, name, uniprot_id, gtype, synonyms in tqdm(
        df.values
    ):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=symbol if pd.notna(symbol) else None,
            definition=name if pd.notna(name) else None,
        )
        if pd.notna(chromosome):
            # Strip the prefix only when it is actually present, so values
            # that do not start with it are kept intact instead of being
            # truncated by a blind slice.
            if chromosome.startswith(chromosome_prefix):
                chromosome = chromosome[len(chromosome_prefix):]
            term.append_property("chromosome", chromosome)
        term.append_parent(so[gtype])
        term.set_species(identifier="4896", name="Schizosaccharomyces pombe")
        # Sort the set so that relationship order is reproducible across runs.
        for hgnc_id in sorted(identifier_to_hgnc_ids.get(identifier, ())):
            term.append_relationship(orthologous, Reference.auto("hgnc", hgnc_id))
        if uniprot_id and pd.notna(uniprot_id):
            term.append_relationship(
                has_gene_product, Reference.auto("uniprot", uniprot_id)
            )
        if synonyms and pd.notna(synonyms):
            for synonym in synonyms.split(","):
                term.append_synonym(Synonym(synonym))
        yield term
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Yield the Sequence Ontology parent terms, then one term per gene row.

    Definitions, the organism-abbreviation-to-taxonomy mapping, the names
    table and the human orthologs are loaded through the module's
    ``_get_*`` helpers, each called with ``version`` and ``force``.

    :param version: Data version forwarded to every loader.
    :param force: Forwarded to every loader as its ``force`` argument.
    :yields: First a bare :class:`Term` for every gene type that has a
        ``GTYPE_TO_SO`` mapping (sorted by gene type), then one
        :class:`Term` per row of the names table.
    """
    definitions = _get_definitions(version=version, force=force)
    abbr_to_taxonomy = _get_organisms(version=version, force=force)
    names_df = _get_names(version=version, force=force)
    human_orthologs = _get_human_orthologs(version=version, force=force)

    # Column 1 of the names table is the gene type (see the row unpacking
    # below); collect an SO reference for every type that has a mapping.
    gtype_column = names_df.columns[1]
    so_references = {}
    for gene_type in names_df[gtype_column].unique():
        so_id = GTYPE_TO_SO.get(gene_type)
        if so_id is not None:
            so_references[gene_type] = Reference.auto("SO", so_id)
            continue
        logger.warning(
            "FlyBase gene type is missing mapping to Sequence Ontology (SO): %s",
            gene_type,
        )

    for gene_type in sorted(so_references):
        yield Term(reference=so_references[gene_type])

    # Organism abbreviations already reported, so each is written only once.
    missing_taxonomies = set()
    for organism, gtype, identifier, symbol, name in tqdm(names_df.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=None if pd.isna(symbol) else symbol,
            definition=definitions.get(identifier),
        )

        has_known_gtype = gtype and pd.notna(gtype) and gtype in so_references
        if has_known_gtype:
            term.append_parent(so_references[gtype])

        if not pd.isna(name):
            term.append_synonym(name)

        for hgnc_curie in human_orthologs.get(identifier, []):
            if hgnc_curie and not pd.isna(hgnc_curie):
                ortholog_reference = Reference.from_curie(hgnc_curie, auto=True)
                if ortholog_reference is not None:
                    term.append_relationship(orthologous, ortholog_reference)
                else:
                    tqdm.write(f"fb:{identifier} had invalid ortholog: {hgnc_curie}")

        taxonomy_id = abbr_to_taxonomy.get(organism)
        if taxonomy_id is None:
            if organism not in missing_taxonomies:
                tqdm.write(f"missing mapping for species abbreviation: {organism}")
                missing_taxonomies.add(organism)
        else:
            term.append_relationship(
                from_species, Reference(NCBITAXON_PREFIX, taxonomy_id)
            )

        yield term

    if missing_taxonomies:
        tqdm.write(f"there were {len(missing_taxonomies)} missing taxa in flybase genes")