def _parse(i, lines: Iterable[Tuple[str, str]]) -> Optional[Term]: dd_: DefaultDict[str, List[str]] = defaultdict(list) for key, value in lines: dd_[key].append(value) dd: Mapping[str, List[str]] = dict(dd_) if "//" in dd: return None accessions = dd["AC"] labels = dd.get("ID") reference = Reference( prefix="uniprot.ptm", identifier=accessions[0], name=labels[0] if labels else None, ) term = Term(reference=reference) for line in dd.get("DR", []): line = line.rstrip(".") for x, y in [ ("MOD; ", "PSI-MOD; MOD:"), ("CHEBI; ", "ChEBI; CHEBI:"), ]: if line.startswith(y): line = x + line[len(y):] ref = Reference.from_curie(line.replace("; ", ":")) if ref: term.append_xref(ref) else: tqdm.write(f"Failure on xref {line}") return term
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Yield Sequence Ontology type terms followed by one term per gene row.

    :param version: Passed through to each of the data loaders.
    :param force: Passed through to each of the data loaders.
    :yields: First, one term for every gene type (second column of the names
        table) that has an entry in ``GTYPE_TO_SO``, ordered by gene type.
        Then one term per row of the names table, with its definition,
        parent type, synonym, human orthologs and species attached when
        they are available.

    Gene types without a Sequence Ontology mapping are logged as warnings.
    Unparseable ortholog CURIEs and unmapped species abbreviations are
    reported through ``tqdm.write``; each unmapped abbreviation is reported
    once, and a summary count is written after the last row.
    """
    definitions = _get_definitions(version=version, force=force)
    abbr_to_taxonomy = _get_organisms(version=version, force=force)
    names_df = _get_names(version=version, force=force)
    human_orthologs = _get_human_orthologs(version=version, force=force)

    gtype_column = names_df.columns[1]
    so = {}
    for gene_type in names_df[gtype_column].unique():
        so_id = GTYPE_TO_SO.get(gene_type)
        if so_id is not None:
            so[gene_type] = Reference.auto("SO", so_id)
            continue
        logger.warning(
            "FlyBase gene type is missing mapping to Sequence Ontology (SO): %s",
            gene_type,
        )

    # Emit the type terms before any gene that may point at them as a parent.
    for _gene_type, so_reference in sorted(so.items()):
        yield Term(reference=so_reference)

    missing_taxonomies = set()
    for organism, gtype, identifier, symbol, name in tqdm(names_df.values):
        label = symbol if pd.notna(symbol) else None
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=label,
            definition=definitions.get(identifier),
        )

        if gtype and pd.notna(gtype) and gtype in so:
            term.append_parent(so[gtype])

        if pd.notna(name):
            term.append_synonym(name)

        for hgnc_curie in human_orthologs.get(identifier, []):
            if hgnc_curie and not pd.isna(hgnc_curie):
                hgnc_ortholog = Reference.from_curie(hgnc_curie, auto=True)
                if hgnc_ortholog is not None:
                    term.append_relationship(orthologous, hgnc_ortholog)
                else:
                    tqdm.write(
                        f"fb:{identifier} had invalid ortholog: {hgnc_curie}"
                    )

        taxonomy_id = abbr_to_taxonomy.get(organism)
        if taxonomy_id is None:
            # Only report each unmapped abbreviation the first time it is seen.
            if organism not in missing_taxonomies:
                tqdm.write(f"missing mapping for species abbreviation: {organism}")
                missing_taxonomies.add(organism)
        else:
            term.append_relationship(
                from_species,
                Reference(NCBITAXON_PREFIX, taxonomy_id),
            )

        yield term

    if missing_taxonomies:
        tqdm.write(
            f"there were {len(missing_taxonomies)} missing taxa in flybase genes"
        )
def _extract_references(s: str) -> Iterable[Reference]:
    """Yield a reference for each comma-separated CURIE in ``s`` that parses.

    :param s: A string of CURIEs separated by commas. The pieces are passed
        to ``Reference.from_curie`` exactly as split, without stripping.
    :yields: The parsed references in their original order; pieces for which
        ``Reference.from_curie`` returns ``None`` are skipped.
    """
    # ``map`` is lazy, so each piece is parsed only when the consumer asks
    # for the next reference.
    candidates = map(Reference.from_curie, s.split(","))
    yield from (
        candidate
        for candidate in candidates
        if candidate is not None
    )