def get_terms() -> Iterable[Term]:
    """Get ComplexPortal terms.

    Loads the table returned by ``get_df()``, parses the aliases, members and
    cross-reference columns, attaches a taxonomy name looked up from the
    ``ncbitaxon`` identifier-to-name mapping, and yields one term per row.

    :yields: One :class:`Term` per row, carrying synonyms (from aliases),
        identity xrefs, PubMed provenance, a species annotation, and a
        ``has_part`` relationship for each parsed member.
    """
    df = get_df()
    # Aliases are pipe-delimited; a missing cell becomes an empty list.
    df['aliases'] = df['aliases'].map(lambda s: s.split('|') if pd.notna(s) else [])
    df['members'] = df['members'].map(_parse_members)
    df['xrefs'] = df['xrefs'].map(_parse_xrefs)

    taxonomy_id_to_name = get_id_name_mapping('ncbitaxon')
    df['taxonomy_name'] = df['taxonomy_id'].map(taxonomy_id_to_name.get)

    slim_df = df[[
        'complexportal_id',
        'name',
        'definition',
        'aliases',
        'xrefs',
        'taxonomy_id',
        'taxonomy_name',
        'members',
    ]]
    it = tqdm(slim_df.values, total=len(slim_df.index), desc=f'mapping {PREFIX}')
    # Remember (note, prefix) pairs already reported so each is logged once.
    unhandled_xref_type = set()
    for complexportal_id, name, definition, aliases, xrefs, taxonomy_id, taxonomy_name, members in it:
        synonyms = [Synonym(name=alias) for alias in aliases]
        _xrefs = []
        provenance = []
        for reference, note in xrefs:
            if note == 'identity':
                _xrefs.append(reference)
            elif note == 'see-also' and reference.prefix == 'pubmed':
                provenance.append(reference)
            elif (note, reference.prefix) not in unhandled_xref_type:
                logger.debug(f'unhandled xref type: {note} / {reference.prefix}')
                unhandled_xref_type.add((note, reference.prefix))

        term = Term(
            reference=Reference(prefix=PREFIX, identifier=complexportal_id, name=name),
            # A missing definition arrives as NaN (a float), which has no
            # ``.strip()``; pass ``None`` instead of crashing on such rows.
            definition=definition.strip() if pd.notna(definition) else None,
            synonyms=synonyms,
            xrefs=_xrefs,
            provenance=provenance,
        )
        term.set_species(identifier=taxonomy_id, name=taxonomy_name)

        # The second element of each parsed member is not used here.
        for reference, _count in members:
            term.append_relationship(has_part, reference)

        yield term
def get_terms(version: str) -> Iterable[Term]:
    """Yield one term per row of the ComplexPortal table for ``version``.

    The raw column headers are renamed to short internal names, the aliases,
    members and cross-reference columns are parsed, and a taxonomy name is
    looked up from the ``ncbitaxon`` identifier-to-name mapping.

    :param version: Passed through to ``get_df``.
    :yields: One :class:`Term` per row with synonyms, identity xrefs, PubMed
        provenance, a species annotation and ``has_part`` relationships.
    """
    column_renames = {
        "Aliases for complex": "aliases",
        "Identifiers (and stoichiometry) of molecules in complex": "members",
        "Taxonomy identifier": "taxonomy_id",
        "Cross references": "xrefs",
        "Description": "definition",
        "Recommended name": "name",
        "#Complex ac": "complexportal_id",
    }
    wanted_columns = [
        "complexportal_id",
        "name",
        "definition",
        "aliases",
        "xrefs",
        "taxonomy_id",
        "taxonomy_name",
        "members",
    ]

    def _split_aliases(cell):
        # Pipe-delimited aliases; a missing cell yields an empty list.
        if pd.notna(cell):
            return cell.split("|")
        return []

    df = get_df(version=version)
    # Renamed in place, exactly as before, so the frame object is unchanged.
    df.rename(inplace=True, columns=column_renames)
    df["aliases"] = df["aliases"].map(_split_aliases)
    df["members"] = df["members"].map(_parse_members)
    df["xrefs"] = df["xrefs"].map(_parse_xrefs)

    names_by_taxonomy_id = get_id_name_mapping("ncbitaxon")
    df["taxonomy_name"] = df["taxonomy_id"].map(names_by_taxonomy_id.get)

    slim_df = df[wanted_columns]
    rows = tqdm(slim_df.values, total=len(slim_df.index), desc=f"mapping {PREFIX}")

    # (note, prefix) pairs that were already reported, so each is logged once.
    reported = set()
    for row in rows:
        (
            complexportal_id,
            name,
            definition,
            aliases,
            xrefs,
            taxonomy_id,
            taxonomy_name,
            members,
        ) = row

        identity_xrefs = []
        pubmed_references = []
        for reference, note in xrefs:
            if note == "identity":
                identity_xrefs.append(reference)
                continue
            if note == "see-also" and reference.prefix == "pubmed":
                pubmed_references.append(reference)
                continue
            pair = (note, reference.prefix)
            if pair in reported:
                continue
            logger.debug(f"unhandled xref type: {note} / {reference.prefix}")
            reported.add(pair)

        if pd.notna(definition):
            clean_definition = definition.strip()
        else:
            clean_definition = None

        term = Term(
            reference=Reference(prefix=PREFIX, identifier=complexportal_id, name=name),
            definition=clean_definition,
            synonyms=[Synonym(name=alias) for alias in aliases],
            xrefs=identity_xrefs,
            provenance=pubmed_references,
        )
        term.set_species(identifier=taxonomy_id, name=taxonomy_name)

        # The second element of each parsed member is not used here.
        for member_reference, _count in members:
            term.append_relationship(has_part, member_reference)

        yield term
def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Term]:  # noqa:C901
    """Get HGNC terms.

    First yields one bare term for every non-empty Sequence Ontology identifier
    in ``LOCUS_TYPE_TO_SO``, then one term per gene entry in the HGNC complete
    set JSON. After all entries have been consumed, a summary of entries whose
    locus type has no SO mapping is written to ``unhandled.json`` and
    ``unhandled.md`` in the prefix directory, and the unhandled locus types and
    unconsumed entry keys are logged.

    :param version: Version string used for the download URL and the cache.
        Defaults to the first day of the current month (``YYYY-MM-01``).
    :param force: Passed through to ``ensure_path``.
    :yields: SO parent terms followed by one :class:`Term` per HGNC entry.
    :raises ValueError: If a status other than ``"Approved"`` is seen a second
        time (see the review note in the status handling below).
    """
    if version is None:
        version = datetime.date.today().strftime("%Y-%m-01")
    unhandled_entry_keys: typing.Counter[str] = Counter()
    unhandle_locus_types: DefaultDict[str, Dict[str, Term]] = defaultdict(dict)
    path = ensure_path(
        PREFIX,
        url=DEFINITIONS_URL_FMT.format(version=version),
        force=force,
        version=version,
        name="hgnc_complete_set.json",
    )
    with open(path) as file:
        entries = json.load(file)["response"]["docs"]

    # Emit the SO terms that are used as parents below, deduplicated and in a
    # stable order.
    yield from sorted(
        {
            Term(reference=Reference.auto("SO", so_id))
            for so_id in sorted(LOCUS_TYPE_TO_SO.values())
            if so_id
        },
        key=attrgetter("identifier"),
    )

    statuses = set()
    for entry in tqdm(entries, desc=f"Mapping {PREFIX}", unit="gene", unit_scale=True):
        # Keys are popped as they are handled so that whatever remains at the
        # end of the iteration can be counted as unhandled.
        name, symbol, identifier = (
            entry.pop("name"),
            entry.pop("symbol"),
            entry.pop("hgnc_id")[len("HGNC:"):],
        )
        status = entry.pop("status")
        # NOTE(review): the first entry with a given non-Approved status is
        # logged and marked obsolete, but any later entry with that same
        # status raises. Confirm whether the raise is intended.
        if status == "Approved":
            is_obsolete = False
        elif status not in statuses:
            statuses.add(status)
            logger.warning("UNHANDLED %s", status)
            is_obsolete = True
        else:
            raise ValueError(f"Unhandled status for hgnc:{identifier}: {status}")

        term = Term(
            definition=name,
            reference=Reference(prefix=PREFIX, identifier=identifier, name=symbol),
            is_obsolete=is_obsolete,
        )

        for uniprot_id in entry.pop("uniprot_ids", []):
            term.append_relationship(
                has_gene_product,
                Reference.auto("uniprot", uniprot_id),
            )
        for ec_code in entry.pop("enzyme_id", []):
            if "-" in ec_code:
                continue  # only add concrete annotations
            term.append_relationship(
                gene_product_member_of,
                Reference.auto("eccode", ec_code),
            )
        # Each item may itself be a comma-separated list of identifiers.
        for rna_central_ids in entry.pop("rna_central_id", []):
            for rna_central_id in rna_central_ids.split(","):
                term.append_relationship(
                    transcribes_to,
                    Reference(prefix="rnacentral", identifier=rna_central_id.strip()),
                )
        mirbase_id = entry.pop("mirbase", None)
        if mirbase_id:
            term.append_relationship(
                transcribes_to,
                Reference.auto(
                    "mirbase",
                    mirbase_id,
                ),
            )
        snornabase_id = entry.pop("snornabase", None)
        if snornabase_id:
            term.append_relationship(
                transcribes_to, Reference(prefix="snornabase", identifier=snornabase_id)
            )

        for rgd_curie in entry.pop("rgd_id", []):
            if not rgd_curie.startswith("RGD:"):
                logger.warning(f"hgnc:{identifier} had bad RGD CURIE: {rgd_curie}")
                continue
            rgd_id = rgd_curie[len("RGD:"):]
            term.append_relationship(
                orthologous,
                Reference.auto(prefix="rgd", identifier=rgd_id),
            )
        for mgi_curie in entry.pop("mgd_id", []):
            if not mgi_curie.startswith("MGI:"):
                logger.warning(f"hgnc:{identifier} had bad MGI CURIE: {mgi_curie}")
                continue
            mgi_id = mgi_curie[len("MGI:"):]
            if not mgi_id:
                continue
            term.append_relationship(
                orthologous,
                Reference.auto(prefix="mgi", identifier=mgi_id),
            )

        for xref_prefix, key in gene_xrefs:
            xref_identifiers = entry.pop(key, None)
            if xref_identifiers is None:
                continue
            # A single value is wrapped so both shapes share one code path.
            if not isinstance(xref_identifiers, list):
                xref_identifiers = [xref_identifiers]
            for xref_identifier in xref_identifiers:
                term.append_xref(Reference(prefix=xref_prefix, identifier=str(xref_identifier)))

        for pubmed_id in entry.pop("pubmed_id", []):
            term.append_provenance(Reference(prefix="pubmed", identifier=str(pubmed_id)))

        # Gene group identifiers and labels are paired up positionally.
        gene_group_ids = entry.pop("gene_group_id", [])
        gene_groups = entry.pop("gene_group", [])
        for gene_group_id, gene_group_label in zip(gene_group_ids, gene_groups):
            term.append_relationship(
                member_of,
                Reference(
                    prefix="hgnc.genegroup",
                    identifier=str(gene_group_id),
                    name=gene_group_label,
                ),
            )

        for alias_symbol in entry.pop("alias_symbol", []):
            term.append_synonym(Synonym(name=alias_symbol, type=alias_symbol_type))
        for alias_name in entry.pop("alias_name", []):
            term.append_synonym(Synonym(name=alias_name, type=alias_name_type))
        for previous_symbol in entry.pop("previous_symbol", []):
            term.append_synonym(Synonym(name=previous_symbol, type=previous_symbol_type))
        for previous_name in entry.pop("prev_name", []):
            term.append_synonym(Synonym(name=previous_name, type=previous_name_type))

        for prop in ["location"]:
            value = entry.pop(prop, None)
            if value:
                term.append_property(prop, value)

        locus_type = entry.pop("locus_type")
        locus_group = entry.pop("locus_group")
        so_id = LOCUS_TYPE_TO_SO.get(locus_type)
        if so_id:
            term.append_parent(Reference.auto("SO", so_id))
        else:
            # No SO mapping: fall back to the generic parent, remember the
            # term for the report written below, and keep the raw locus
            # type/group as properties.
            term.append_parent(Reference.auto("SO", "0000704"))  # gene
            unhandle_locus_types[locus_type][identifier] = term
            term.append_property("locus_type", locus_type)
            term.append_property("locus_group", locus_group)

        # Fixed: the species name literal was corrupted ("H**o sapiens").
        term.set_species(identifier="9606", name="Homo sapiens")

        # Anything still left in the entry was not consumed above.
        for key in entry:
            unhandled_entry_keys[key] += 1
        yield term

    # The report below only runs once the generator has been fully consumed.
    with open(prefix_directory_join(PREFIX, name="unhandled.json"), "w") as file:
        json.dump(
            {
                k: {hgnc_id: term.name for hgnc_id, term in v.items()}
                for k, v in unhandle_locus_types.items()
            },
            file,
            indent=2,
        )

    with open(prefix_directory_join(PREFIX, name="unhandled.md"), "w") as file:
        for k, v in sorted(unhandle_locus_types.items()):
            t = tabulate(
                [
                    (
                        hgnc_id,
                        term.name,
                        term.is_obsolete,
                        term.link,
                        ", ".join(p.link for p in term.provenance if p.link),
                    )
                    for hgnc_id, term in sorted(v.items())
                ],
                headers=["hgnc_id", "name", "obsolete", "link", "provenance"],
                tablefmt="github",
            )
            print(f"## {k} ({len(v)})", file=file)  # noqa: T201
            print(t, "\n", file=file)  # noqa: T201

    unhandle_locus_type_counter = Counter(
        {locus_type: len(d) for locus_type, d in unhandle_locus_types.items()}
    )
    logger.warning(
        "Unhandled locus types:\n%s", tabulate(unhandle_locus_type_counter.most_common())
    )
    logger.warning("Unhandled keys:\n%s", tabulate(unhandled_entry_keys.most_common()))
def get_terms(force: bool = False, version: Optional[str] = None) -> Iterable[Term]:
    """Yield one term per row of the RGD genes table.

    :param force: Passed through to ``ensure_df``.
    :param version: Passed through to ``ensure_df``.
    :yields: One :class:`Term` per row, with a definition (``NAME``, falling
        back to ``GENE_DESC``), synonyms from the old names and symbols,
        xrefs and relationships from the columns in ``namespace_to_column``,
        PubMed provenance, and the rat species annotation.
    """
    # NOTE(review): ``error_bad_lines`` was deprecated in pandas 1.3 and
    # removed in 2.0 in favour of ``on_bad_lines="skip"``; left unchanged here
    # because the supported pandas range is not visible from this function.
    genes_df = ensure_df(
        PREFIX,
        url=GENES_URL,
        sep="\t",
        header=0,
        comment="#",
        dtype=str,
        force=force,
        version=version,
        quoting=3,  # same value as csv.QUOTE_NONE
        error_bad_lines=False,
    )
    rows = tqdm(
        genes_df.iterrows(),
        total=len(genes_df.index),
        desc=f"Mapping {PREFIX}",
        unit_scale=True,
    )
    for _, row in rows:
        # First non-missing of NAME, GENE_DESC; otherwise no definition.
        definition = next(
            (row[column] for column in ("NAME", "GENE_DESC") if pd.notna(row[column])),
            None,
        )
        term = Term(
            reference=Reference(
                prefix=PREFIX, identifier=row["GENE_RGD_ID"], name=row["SYMBOL"]
            ),
            definition=definition,
        )

        # Old names first, then old symbols; both are semicolon-delimited.
        synonym_columns = (
            ("OLD_NAME", old_name_type),
            ("OLD_SYMBOL", old_symbol_type),
        )
        for column, synonym_type in synonym_columns:
            joined = row[column]
            if not joined or pd.isna(joined):
                continue
            for text in joined.split(";"):
                term.append_synonym(Synonym(name=text, type=synonym_type))

        for prefix, column in namespace_to_column:
            # A missing cell stringifies to "nan", which is skipped below.
            joined = str(row[column])
            if not joined or pd.isna(joined):
                continue
            for xref_id in joined.split(";"):
                if xref_id == "nan":
                    continue
                if prefix == "uniprot":
                    term.append_relationship(
                        has_gene_product,
                        Reference.auto(prefix=prefix, identifier=xref_id),
                    )
                    continue
                if prefix != "ensembl":
                    term.append_xref(Reference(prefix=prefix, identifier=xref_id))
                    continue
                # Ensembl identifiers are routed by their stable-ID prefix.
                if xref_id.startswith(("ENSMUSG", "ENSRNOG")):
                    term.append_xref(Reference(prefix=prefix, identifier=xref_id))
                elif xref_id.startswith("ENSMUST"):
                    term.append_relationship(
                        transcribes_to, Reference(prefix=prefix, identifier=xref_id)
                    )
                elif xref_id.startswith("ENSMUSP"):
                    term.append_relationship(
                        has_gene_product, Reference(prefix=prefix, identifier=xref_id)
                    )
                else:
                    logger.warning("[%s] unhandled xref ensembl:%s", PREFIX, xref_id)

        curated_pubmed = row["CURATED_REF_PUBMED_ID"]
        if curated_pubmed and pd.notna(curated_pubmed):
            for pubmed_id in str(curated_pubmed).split(";"):
                term.append_provenance(Reference(prefix="pubmed", identifier=pubmed_id))

        term.set_species(identifier="10116", name="Rattus norvegicus")
        yield term