def get_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over dictyBase gene terms.

    :param force: If true, re-download the source files even if cached.
    :yields: A :class:`Term` per gene, annotated with product/synonym
        synonyms, UniProt gene-product relationships, and the species.
    """
    # uniprot_mappings.tsv columns: DDB ID, DDB_G ID, Name, UniProt ID.
    # Keep only (DDB_G ID, UniProt ID); a gene may map to several accessions.
    uniprot_mappings = multisetdict(
        ensure_df(PREFIX, url=URL, force=force, name="uniprot_mappings.tsv", usecols=[1, 3]).values
    )
    terms = ensure_df(PREFIX, url=URL, force=force, name="gene_info.tsv")
    # gene_info.tsv columns: GENE ID (DDB_G ID), Gene Name, Synonyms, Gene products
    for identifier, name, synonyms, products in tqdm(terms.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=name,
        )
        # Gene products are comma-separated; "unknown" is a placeholder.
        if products and pd.notna(products) and products != "unknown":
            for synonym in products.split(","):
                term.append_synonym(synonym.strip())
        if synonyms and pd.notna(synonyms):
            for synonym in synonyms.split(","):
                term.append_synonym(Synonym(synonym.strip()))
        for uniprot_id in uniprot_mappings.get(identifier, []):
            # FIX: the previous condition used ``not in {"unknown", "pseudogene"}``,
            # which skipped every real UniProt accession and kept only the
            # placeholder strings. Skip missing values and placeholders instead.
            if not uniprot_id or pd.isna(uniprot_id) or uniprot_id in {"unknown", "pseudogene"}:
                continue
            term.append_relationship(has_gene_product, Reference.auto("uniprot", uniprot_id))
        term.set_species(identifier="44689", name="Dictyostelium discoideum")
        yield term
def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Term]:  # noqa:C901
    """Iterate over HGNC gene terms.

    :param version: The monthly HGNC release to use. Defaults to the first
        day of the current month.
    :param force: If true, re-download the source file even if cached.
    :yields: First, one term per Sequence Ontology (SO) class used as a
        parent; then one term per HGNC gene entry.
    """
    if version is None:
        version = datetime.date.today().strftime("%Y-%m-01")
    unhandled_entry_keys: typing.Counter[str] = Counter()
    unhandle_locus_types: DefaultDict[str, Dict[str, Term]] = defaultdict(dict)
    path = ensure_path(
        PREFIX,
        url=DEFINITIONS_URL_FMT.format(version=version),
        force=force,
        version=version,
        name="hgnc_complete_set.json",
    )
    with open(path) as file:
        entries = json.load(file)["response"]["docs"]

    # Yield the SO parent classes first so they exist before any gene term
    # that references them as a parent.
    yield from sorted(
        {
            Term(reference=Reference.auto("SO", so_id))
            for so_id in sorted(LOCUS_TYPE_TO_SO.values())
            if so_id
        },
        key=attrgetter("identifier"),
    )

    statuses = set()
    for entry in tqdm(entries, desc=f"Mapping {PREFIX}", unit="gene", unit_scale=True):
        name, symbol, identifier = (
            entry.pop("name"),
            entry.pop("symbol"),
            entry.pop("hgnc_id")[len("HGNC:"):],
        )
        status = entry.pop("status")
        if status == "Approved":
            is_obsolete = False
        else:
            # FIX: the previous logic raised ValueError on the *second*
            # occurrence of any non-approved status (the first occurrence
            # added it to ``statuses``, sending later entries with the same
            # status into an unconditional ``raise``). Warn once per status
            # and mark every non-approved entry obsolete instead.
            if status not in statuses:
                statuses.add(status)
                logger.warning("UNHANDLED %s", status)
            is_obsolete = True
        # HGNC's "name" field is a descriptive phrase, so it is stored as the
        # definition; the gene symbol is used as the term's name.
        term = Term(
            definition=name,
            reference=Reference(prefix=PREFIX, identifier=identifier, name=symbol),
            is_obsolete=is_obsolete,
        )
        for uniprot_id in entry.pop("uniprot_ids", []):
            term.append_relationship(
                has_gene_product,
                Reference.auto("uniprot", uniprot_id),
            )
        for ec_code in entry.pop("enzyme_id", []):
            if "-" in ec_code:
                continue  # only add concrete annotations
            term.append_relationship(
                gene_product_member_of,
                Reference.auto("eccode", ec_code),
            )
        # The RNAcentral field may pack several comma-separated ids per value.
        for rna_central_ids in entry.pop("rna_central_id", []):
            for rna_central_id in rna_central_ids.split(","):
                term.append_relationship(
                    transcribes_to,
                    Reference(prefix="rnacentral", identifier=rna_central_id.strip()),
                )
        mirbase_id = entry.pop("mirbase", None)
        if mirbase_id:
            term.append_relationship(
                transcribes_to,
                Reference.auto(
                    "mirbase",
                    mirbase_id,
                ),
            )
        snornabase_id = entry.pop("snornabase", None)
        if snornabase_id:
            term.append_relationship(
                transcribes_to, Reference(prefix="snornabase", identifier=snornabase_id)
            )

        for rgd_curie in entry.pop("rgd_id", []):
            if not rgd_curie.startswith("RGD:"):
                logger.warning(f"hgnc:{identifier} had bad RGD CURIE: {rgd_curie}")
                continue
            rgd_id = rgd_curie[len("RGD:"):]
            term.append_relationship(
                orthologous,
                Reference.auto(prefix="rgd", identifier=rgd_id),
            )
        for mgi_curie in entry.pop("mgd_id", []):
            if not mgi_curie.startswith("MGI:"):
                logger.warning(f"hgnc:{identifier} had bad MGI CURIE: {mgi_curie}")
                continue
            mgi_id = mgi_curie[len("MGI:"):]
            if not mgi_id:
                continue
            term.append_relationship(
                orthologous,
                Reference.auto(prefix="mgi", identifier=mgi_id),
            )

        for xref_prefix, key in gene_xrefs:
            xref_identifiers = entry.pop(key, None)
            if xref_identifiers is None:
                continue
            # Some xref fields are scalar, some are lists; normalize to a list.
            if not isinstance(xref_identifiers, list):
                xref_identifiers = [xref_identifiers]
            for xref_identifier in xref_identifiers:
                term.append_xref(Reference(prefix=xref_prefix, identifier=str(xref_identifier)))

        for pubmed_id in entry.pop("pubmed_id", []):
            term.append_provenance(Reference(prefix="pubmed", identifier=str(pubmed_id)))

        # Gene group ids and labels are parallel lists.
        gene_group_ids = entry.pop("gene_group_id", [])
        gene_groups = entry.pop("gene_group", [])
        for gene_group_id, gene_group_label in zip(gene_group_ids, gene_groups):
            term.append_relationship(
                member_of,
                Reference(
                    prefix="hgnc.genegroup",
                    identifier=str(gene_group_id),
                    name=gene_group_label,
                ),
            )

        for alias_symbol in entry.pop("alias_symbol", []):
            term.append_synonym(Synonym(name=alias_symbol, type=alias_symbol_type))
        for alias_name in entry.pop("alias_name", []):
            term.append_synonym(Synonym(name=alias_name, type=alias_name_type))
        for previous_symbol in entry.pop("previous_symbol", []):
            term.append_synonym(Synonym(name=previous_symbol, type=previous_symbol_type))
        for previous_name in entry.pop("prev_name", []):
            term.append_synonym(Synonym(name=previous_name, type=previous_name_type))

        for prop in ["location"]:
            value = entry.pop(prop, None)
            if value:
                term.append_property(prop, value)

        locus_type = entry.pop("locus_type")
        locus_group = entry.pop("locus_group")
        so_id = LOCUS_TYPE_TO_SO.get(locus_type)
        if so_id:
            term.append_parent(Reference.auto("SO", so_id))
        else:
            # Fall back to the generic SO "gene" class and record the locus
            # type so it can be reported at the end of the run.
            term.append_parent(Reference.auto("SO", "0000704"))  # gene
            unhandle_locus_types[locus_type][identifier] = term
        term.append_property("locus_type", locus_type)
        term.append_property("locus_group", locus_group)
        # FIX: the species name was garbled ("H**o sapiens") in the previous
        # revision.
        term.set_species(identifier="9606", name="Homo sapiens")
        # Anything left in the entry was not consumed above; tally for the
        # summary report.
        for key in entry:
            unhandled_entry_keys[key] += 1
        yield term

    with open(prefix_directory_join(PREFIX, name="unhandled.json"), "w") as file:
        json.dump(
            {
                k: {hgnc_id: term.name for hgnc_id, term in v.items()}
                for k, v in unhandle_locus_types.items()
            },
            file,
            indent=2,
        )

    with open(prefix_directory_join(PREFIX, name="unhandled.md"), "w") as file:
        for k, v in sorted(unhandle_locus_types.items()):
            t = tabulate(
                [
                    (
                        hgnc_id,
                        term.name,
                        term.is_obsolete,
                        term.link,
                        ", ".join(p.link for p in term.provenance if p.link),
                    )
                    for hgnc_id, term in sorted(v.items())
                ],
                headers=["hgnc_id", "name", "obsolete", "link", "provenance"],
                tablefmt="github",
            )
            print(f"## {k} ({len(v)})", file=file)  # noqa: T201
            print(t, "\n", file=file)  # noqa: T201

    unhandle_locus_type_counter = Counter(
        {locus_type: len(d) for locus_type, d in unhandle_locus_types.items()}
    )
    logger.warning(
        "Unhandled locus types:\n%s", tabulate(unhandle_locus_type_counter.most_common())
    )
    logger.warning("Unhandled keys:\n%s", tabulate(unhandled_entry_keys.most_common()))
def get_terms(force: bool = False, version: Optional[str] = None) -> Iterable[Term]:
    """Iterate over ZFIN terms: SO parent classes first, then one term per marker."""
    alts_df = ensure_df(
        PREFIX,
        url=ALTS_URL,
        name="alts.tsv",
        force=force,
        header=None,
        names=["alt", "zfin_id"],
        version=version,
    )
    # Group all secondary (alternate) identifiers under their primary ZFIN id.
    alt_ids_by_primary = defaultdict(set)
    for secondary_id, primary_id in alts_df.values:
        alt_ids_by_primary[primary_id].add(secondary_id)

    # Ortholog and mapping lookups, each keyed by the primary ZFIN identifier.
    hgnc_orthologs = multisetdict(
        ensure_df(
            PREFIX, url=HUMAN_ORTHOLOGS, force=force, header=None, usecols=[0, 7], version=version
        ).values
    )
    mgi_orthologs = multisetdict(
        ensure_df(
            PREFIX, url=MOUSE_ORTHOLOGS, force=force, header=None, usecols=[0, 5], version=version
        ).values
    )
    flybase_orthologs = multisetdict(
        ensure_df(
            PREFIX, url=FLY_ORTHOLOGS, force=force, header=None, usecols=[0, 5], version=version
        ).values
    )
    ncbigene_by_zfin = dict(
        ensure_df(
            PREFIX, url=ENTREZ_MAPPINGS, force=force, header=None, usecols=[0, 3], version=version
        ).values
    )
    uniprot_by_zfin = multidict(
        ensure_df(
            PREFIX, url=UNIPROT_MAPPINGS, force=force, header=None, usecols=[0, 3], version=version
        ).values
    )

    markers_df = ensure_df(
        PREFIX,
        url=URL,
        name="markers.tsv",
        force=force,
        header=None,
        names=MARKERS_COLUMNS,
        version=version,
    )
    # Drop the "SO:" CURIE prefix, leaving bare Sequence Ontology identifiers.
    markers_df["sequence_ontology_id"] = markers_df["sequence_ontology_id"].map(
        lambda curie: curie[len("SO:"):]
    )
    so_references = {
        so_identifier: Reference.auto(prefix="SO", identifier=so_identifier)
        for so_identifier in markers_df["sequence_ontology_id"].unique()
    }
    # Emit the SO parent classes before the markers that point at them.
    for so_identifier in sorted(so_references):
        yield Term(reference=so_references[so_identifier])

    for identifier, name, definition, _entity_type, so_identifier in tqdm(markers_df.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=name,
            # A definition that merely repeats the name adds nothing.
            definition=None if definition == name else definition,
        )
        term.set_species(identifier="7955", name="Danio rerio")
        term.append_parent(so_references[so_identifier])
        # Entity type is redundant of identifier
        # term.append_property("type", entity_type)
        for secondary_id in alt_ids_by_primary[identifier]:
            term.append_alt(secondary_id)
        ncbigene_id = ncbigene_by_zfin.get(identifier)
        if ncbigene_id:
            term.append_xref(Reference("ncbigene", ncbigene_id))
        for uniprot_id in uniprot_by_zfin.get(identifier, []):
            term.append_relationship(has_gene_product, Reference.auto("uniprot", uniprot_id))
        for hgnc_id in hgnc_orthologs.get(identifier, []):
            term.append_relationship(orthologous, Reference.auto("hgnc", hgnc_id))
        for mgi_curie in mgi_orthologs.get(identifier, []):
            # The mouse column is a full CURIE; skip values that fail to parse.
            mgi_reference = Reference.from_curie(mgi_curie, auto=True)
            if mgi_reference:
                term.append_relationship(orthologous, mgi_reference)
        for flybase_id in flybase_orthologs.get(identifier, []):
            term.append_relationship(orthologous, Reference("flybase", flybase_id))
        yield term
def get_terms(force: bool = False, version: Optional[str] = None) -> Iterable[Term]:
    """Iterate over RGD gene terms built from the genes download."""
    # NOTE(review): ``error_bad_lines`` is deprecated in pandas>=1.3 and
    # removed in 2.0 (``on_bad_lines="skip"`` is the replacement) — confirm
    # the project's pandas pin before upgrading.
    df = ensure_df(
        PREFIX,
        url=GENES_URL,
        sep="\t",
        header=0,
        comment="#",
        dtype=str,
        force=force,
        version=version,
        quoting=3,
        error_bad_lines=False,
    )
    rows = tqdm(df.iterrows(), total=len(df.index), desc=f"Mapping {PREFIX}", unit_scale=True)
    for _, row in rows:
        # Prefer the curated NAME, fall back to GENE_DESC, else no definition.
        definition = None
        for definition_column in ("NAME", "GENE_DESC"):
            if pd.notna(row[definition_column]):
                definition = row[definition_column]
                break
        term = Term(
            reference=Reference(prefix=PREFIX, identifier=row["GENE_RGD_ID"], name=row["SYMBOL"]),
            definition=definition,
        )
        # Old names and old symbols are semicolon-delimited synonym lists.
        for synonym_column, synonym_type in (
            ("OLD_NAME", old_name_type),
            ("OLD_SYMBOL", old_symbol_type),
        ):
            cell = row[synonym_column]
            if cell and pd.notna(cell):
                for synonym in cell.split(";"):
                    term.append_synonym(Synonym(name=synonym, type=synonym_type))
        for xref_prefix, column in namespace_to_column:
            xref_cell = str(row[column])
            if not xref_cell or pd.isna(xref_cell):
                continue
            for xref_id in xref_cell.split(";"):
                if xref_id == "nan":  # str() of a missing cell
                    continue
                if xref_prefix == "uniprot":
                    term.append_relationship(
                        has_gene_product, Reference.auto(prefix=xref_prefix, identifier=xref_id)
                    )
                elif xref_prefix == "ensembl":
                    if xref_id.startswith(("ENSMUSG", "ENSRNOG")):
                        # second one is reverse strand
                        term.append_xref(Reference(prefix=xref_prefix, identifier=xref_id))
                    elif xref_id.startswith("ENSMUST"):
                        term.append_relationship(
                            transcribes_to, Reference(prefix=xref_prefix, identifier=xref_id)
                        )
                    elif xref_id.startswith("ENSMUSP"):
                        term.append_relationship(
                            has_gene_product, Reference(prefix=xref_prefix, identifier=xref_id)
                        )
                    else:
                        logger.warning("[%s] unhandled xref ensembl:%s", PREFIX, xref_id)
                else:
                    term.append_xref(Reference(prefix=xref_prefix, identifier=xref_id))
        pubmed_cell = row["CURATED_REF_PUBMED_ID"]
        if pubmed_cell and pd.notna(pubmed_cell):
            for pubmed_id in str(pubmed_cell).split(";"):
                term.append_provenance(Reference(prefix="pubmed", identifier=pubmed_id))
        term.set_species(identifier="10116", name="Rattus norvegicus")
        yield term