def iter_terms() -> Iterable[Term]:
    """Yield one term per KEGG Genome record.

    When a genome carries an NCBI Taxonomy identifier, the term gets a single
    ``ncbitaxon`` cross-reference. The taxonomy name is looked up through
    :func:`pyobo.get_name`; failed lookups are reported with ``tqdm.write``,
    counted, and the cross-reference is still emitted with ``name=None``.
    The total number of failed lookups is logged once all genomes are consumed.
    """
    missing_names = 0
    for genome in iter_kegg_genomes():
        taxonomy_id = genome.taxonomy_id
        cross_references = []
        if taxonomy_id is not None:
            taxonomy_name = pyobo.get_name('ncbitaxon', taxonomy_id)
            if taxonomy_name is None:
                missing_names += 1
                tqdm.write(f'could not find name for taxonomy:{taxonomy_id}')
            cross_references.append(
                Reference(
                    prefix='ncbitaxon',
                    identifier=taxonomy_id,
                    name=taxonomy_name,
                )
            )

        genome_reference = Reference(
            prefix='kegg.genome',
            identifier=genome.identifier,
            name=genome.name,
        )
        yield Term(reference=genome_reference, xrefs=cross_references)

    logger.info(
        '[%s] unable to find %d taxonomy names in NCBI',
        KEGG_GENOME_PREFIX,
        missing_names,
    )
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Yield terms for GWAS Central phenotypes ``HGVPM1`` .. ``HGVPM10999``.

    Each phenotype is fetched (or read from cache) as a JSON document through
    ``ensure_path``. Numbers whose download raises :class:`OSError` are
    reported with ``tqdm.write`` and skipped.

    :param version: Data version forwarded to ``ensure_path``.
    :param force: Forwarded to ``ensure_path``.
    """
    for number in trange(1, 11000, desc=f"{PREFIX} download"):
        try:
            json_path = ensure_path(
                PREFIX,
                "phenotype",
                version=version,
                url=f"https://www.gwascentral.org/phenotype/HGVPM{number}?format=json",
                name=f"HGVPM{number}.json",
                force=force,
            )
        except OSError as e:
            tqdm.write(f"{number}: {e}")
            continue

        with open(path := json_path) as file:
            record = json.load(file)

        raw_description = record.get("description")
        # Collapse multi-line descriptions onto one line; keep None as-is.
        definition = (
            None
            if raw_description is None
            else raw_description.strip().replace("\n", " ")
        )
        yield Term(
            reference=Reference(PREFIX, record["identifier"], record["name"]),
            definition=definition,
        )
def _get_term_from_tree(tree: ElementTree.ElementTree) -> Term:
    """Build a term from a parsed GWAS Central XML record.

    Reads the ``name``, ``description`` and ``identifier`` children through
    ``_find_text`` and attaches one ``has_part`` relationship per
    ``experiments`` child that carries an identifier.

    :raises ValueError: if ``_find_text`` yields ``None`` for the identifier.
    """
    identifier = _find_text(tree, "identifier")
    if identifier is None:
        raise ValueError

    definition = _find_text(tree, "description")
    if definition:
        # Flatten the description onto a single line.
        definition = definition.strip().replace("\n", " ")

    term = Term(
        reference=Reference(PREFIX, identifier, _find_text(tree, "name")),
        definition=definition,
    )

    for experiment_element in tree.findall("experiments"):
        label = _find_text(experiment_element, "name")
        experiment_id = _find_text(experiment_element, "identifier")
        if experiment_id is not None:
            term.append_relationship(
                has_part,
                Reference(
                    "gwascentral.experiment",
                    identifier=experiment_id,
                    name=label,
                ),
            )
    return term
def get_terms() -> Iterable[Term]:
    """Get ComplexPortal terms.

    Loads the ComplexPortal table through ``get_df``, parses the alias, member
    and cross-reference columns, and yields one term per row.

    Cross-references annotated ``identity`` become xrefs; ``see-also``
    references with the ``pubmed`` prefix become provenance; every other
    (note, prefix) combination is logged once at debug level and dropped.
    Each parsed member contributes a ``has_part`` relationship.

    A missing (NaN) description produces ``definition=None`` instead of
    failing on ``.strip()``.
    """
    df = get_df()
    df['aliases'] = df['aliases'].map(lambda s: s.split('|') if pd.notna(s) else [])
    df['members'] = df['members'].map(_parse_members)
    df['xrefs'] = df['xrefs'].map(_parse_xrefs)

    taxonomy_id_to_name = get_id_name_mapping('ncbitaxon')
    df['taxonomy_name'] = df['taxonomy_id'].map(taxonomy_id_to_name.get)

    slim_df = df[[
        'complexportal_id',
        'name',
        'definition',
        'aliases',
        'xrefs',
        'taxonomy_id',
        'taxonomy_name',
        'members',
    ]]
    it = tqdm(slim_df.values, total=len(slim_df.index), desc=f'mapping {PREFIX}')
    # (note, prefix) pairs already reported, so each is logged only once.
    unhandled_xref_type = set()
    for complexportal_id, name, definition, aliases, xrefs, taxonomy_id, taxonomy_name, members in it:
        synonyms = [Synonym(name=alias) for alias in aliases]
        _xrefs = []
        provenance = []
        for reference, note in xrefs:
            if note == 'identity':
                _xrefs.append(reference)
            elif note == 'see-also' and reference.prefix == 'pubmed':
                provenance.append(reference)
            elif (note, reference.prefix) not in unhandled_xref_type:
                logger.debug(f'unhandled xref type: {note} / {reference.prefix}')
                unhandled_xref_type.add((note, reference.prefix))

        term = Term(
            reference=Reference(prefix=PREFIX, identifier=complexportal_id, name=name),
            # Empty cells arrive as NaN (a float), which has no .strip().
            definition=definition.strip() if pd.notna(definition) else None,
            synonyms=synonyms,
            xrefs=_xrefs,
            provenance=provenance,
        )
        term.set_species(identifier=taxonomy_id, name=taxonomy_name)

        for reference, _count in members:
            term.append_relationship(has_part, reference)

        yield term
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Yield FlyBase gene terms, preceded by the Sequence Ontology parent terms.

    Gene types without an entry in ``GTYPE_TO_SO`` are logged and get no
    parent. Each gene term may receive a synonym (its full name), ``orthologous``
    relationships to human orthologs and a ``from_species`` relationship.
    Organism abbreviations without a taxonomy mapping are reported once each
    and summarised after the last term.

    :param version: Data version forwarded to the loader helpers.
    :param force: Forwarded to the loader helpers.
    """
    definitions = _get_definitions(version=version, force=force)
    abbr_to_taxonomy = _get_organisms(version=version, force=force)
    names_df = _get_names(version=version, force=force)
    human_orthologs = _get_human_orthologs(version=version, force=force)
    unmapped_organisms = set()

    gtype_to_so = {}
    gtype_column = names_df.columns[1]
    for gene_type in names_df[gtype_column].unique():
        so_identifier = GTYPE_TO_SO.get(gene_type)
        if so_identifier is not None:
            gtype_to_so[gene_type] = Reference.auto("SO", so_identifier)
            continue
        logger.warning(
            "FlyBase gene type is missing mapping to Sequence Ontology (SO): %s",
            gene_type,
        )

    for gene_type in sorted(gtype_to_so):
        yield Term(reference=gtype_to_so[gene_type])

    for organism, gtype, identifier, symbol, name in tqdm(names_df.values):
        label = symbol if pd.notna(symbol) else None
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=label,
            definition=definitions.get(identifier),
        )
        if gtype and pd.notna(gtype) and gtype in gtype_to_so:
            term.append_parent(gtype_to_so[gtype])
        if pd.notna(name):
            term.append_synonym(name)

        for hgnc_curie in human_orthologs.get(identifier, []):
            if hgnc_curie and not pd.isna(hgnc_curie):
                ortholog = Reference.from_curie(hgnc_curie, auto=True)
                if ortholog is not None:
                    term.append_relationship(orthologous, ortholog)
                else:
                    tqdm.write(f"fb:{identifier} had invalid ortholog: {hgnc_curie}")

        taxonomy_id = abbr_to_taxonomy.get(organism)
        if taxonomy_id is not None:
            term.append_relationship(
                from_species, Reference(NCBITAXON_PREFIX, taxonomy_id)
            )
        elif organism not in unmapped_organisms:
            # Report each unmapped abbreviation only the first time it is seen.
            tqdm.write(f"missing mapping for species abbreviation: {organism}")
            unmapped_organisms.add(organism)

        yield term

    if unmapped_organisms:
        tqdm.write(
            f"there were {len(unmapped_organisms)} missing taxa in flybase genes"
        )
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms.

    Ensures the ITIS archive is downloaded, extracts the SQLite database from
    it on first use, then reads every taxon name and every child/parent pair
    and yields one term per taxon with its parents attached.

    The database connection is closed as soon as both queries have been
    fetched, before any term is yielded, so a partially consumed generator
    does not keep the connection open.

    :raises FileNotFoundError: if the SQLite file is still missing after the
        extraction step.
    """
    zip_path = ensure_path(PREFIX, url=URL)
    version = _get_version()
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, name='ITIS.sqlite', version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for x in zip_file.filelist:
                if x.filename.endswith('.sqlite'):
                    zip_file.extract(x, sqlite_dir)
                    # Move the database out of the versioned sub-folder of the
                    # archive and drop the then-empty folder.
                    shutil.move(
                        os.path.join(sqlite_dir, f'itisSqlite{version}', 'ITIS.sqlite'),
                        sqlite_path,
                    )
                    os.rmdir(os.path.join(sqlite_dir, f'itisSqlite{version}'))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    # Both result sets are fully materialised with fetchall(), so the
    # connection can be released before iteration starts.
    with closing(sqlite3.connect(sqlite_path)) as conn:
        with closing(conn.cursor()) as cursor:
            cursor.execute(LONGNAMES_QUERY)
            id_to_reference = {
                str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
                for identifier, name in cursor.fetchall()
            }

        with closing(conn.cursor()) as cursor:
            cursor.execute(HIERARCHY_QUERY)
            id_to_parents = multidict(
                (str(child), str(parent))
                for child, parent in cursor.fetchall()
            )

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == '0':  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Yield PomBase gene terms, preceded by the Sequence Ontology parent terms.

    Human orthologs are read as ``|``-separated HGNC symbols (``NONE`` rows are
    skipped) and resolved to HGNC identifiers; symbols that cannot be resolved
    are dropped. Every gene term gets its chromosome as a property, its SO
    parent, the species 4896, and optionally ortholog, gene-product and
    synonym annotations.

    :param version: Data version forwarded to ``ensure_df``.
    :param force: Forwarded to ``ensure_df``.
    """
    orthologs_df = ensure_df(
        PREFIX, url=ORTHOLOGS_URL, force=force, header=None, version=version
    )
    orthologs_by_gene = defaultdict(set)
    hgnc_symbol_to_id = pyobo.get_name_id_mapping("hgnc")
    for gene_id, symbol_field in orthologs_df.values:
        if symbol_field == "NONE":
            continue
        for hgnc_symbol in symbol_field.split("|"):
            resolved = hgnc_symbol_to_id.get(hgnc_symbol)
            if resolved is None:
                continue
            orthologs_by_gene[gene_id].add(resolved)

    df = ensure_df(PREFIX, url=URL, force=force, header=None, version=version)
    gene_types = sorted(df[df.columns[6]].unique())
    so = {}
    for gene_type in gene_types:
        so[gene_type] = Reference.auto("SO", POMBASE_TO_SO[gene_type])
    for gene_type in sorted(so):
        yield Term(reference=so[gene_type])

    chromosome_prefix_length = len("chromosome_")
    for identifier, _, symbol, chromosome, name, uniprot_id, gtype, synonyms in tqdm(df.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=None if pd.isna(symbol) else symbol,
            definition=None if pd.isna(name) else name,
        )
        # Store the chromosome without its "chromosome_" prefix.
        term.append_property("chromosome", chromosome[chromosome_prefix_length:])
        term.append_parent(so[gtype])
        term.set_species(identifier="4896", name="Schizosaccharomyces pombe")

        for hgnc_id in orthologs_by_gene.get(identifier, []):
            term.append_relationship(orthologous, Reference.auto("hgnc", hgnc_id))

        if uniprot_id and pd.notna(uniprot_id):
            term.append_relationship(
                has_gene_product, Reference.auto("uniprot", uniprot_id)
            )

        if synonyms and pd.notna(synonyms):
            for synonym in synonyms.split(","):
                term.append_synonym(Synonym(synonym))

        yield term
def iter_terms(version: str) -> Iterable[Term]:
    """Yield one term per ``(identifier, name)`` row returned by ``QUERY``.

    The ChEMBL SQLite database for ``version`` is located with ``get_path``.
    The connection and cursor are both closed when the generator finishes
    or is closed.

    :param version: ChEMBL version forwarded to ``get_path``.
    """
    database_path = get_path(version=version)
    logger.info('opening connection to %s', database_path)
    with closing(sqlite3.connect(database_path)) as connection:
        logger.info('using connection %s', connection)
        with closing(connection.cursor()) as cur:
            logger.info('using cursor %s', cur)
            cur.execute(QUERY)
            rows = cur.fetchall()
            for chembl_id, name in rows:
                # TODO add xrefs to smiles, inchi, inchikey here
                compound_xrefs = []
                compound = Reference(prefix=PREFIX, identifier=chembl_id, name=name)
                yield Term(reference=compound, xrefs=compound_xrefs)
def iter_terms(version: str) -> Iterable[Term]:
    """Yield DrugCentral terms with structure cross-references.

    Rows missing any of SMILES, InChI or InChIKey are logged and skipped.
    Every emitted term carries ``smiles``, ``inchi`` and ``inchikey`` xrefs,
    plus a ``cas`` xref when a CAS number is present.

    :param version: Data version forwarded to ``ensure_df``.
    """
    df = ensure_df(PREFIX, url=URL, version=version)
    for smiles, inchi, inchi_key, drugcentral_id, drugcentral_name, cas in df.values:
        structure_fields = (
            ("smiles", smiles),
            ("inchi", inchi),
            ("inchikey", inchi_key),
        )
        if any(pd.isna(value) for _, value in structure_fields):
            logger.warning("missing data for drugcentral:%s", drugcentral_id)
            continue

        xrefs = [
            Reference(prefix=xref_prefix, identifier=value)
            for xref_prefix, value in structure_fields
        ]
        if not pd.isna(cas):
            xrefs.append(Reference(prefix="cas", identifier=cas))

        drug = Reference(prefix=PREFIX, identifier=drugcentral_id, name=drugcentral_name)
        yield Term(reference=drug, xrefs=xrefs)
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over terms from GWAS Central Phenotype.

    Fetches (or reads from cache) the JSON record of every phenotype
    ``HGVPM1`` .. ``HGVPM10999`` and yields one term per record. Numbers whose
    download raises :class:`OSError` are reported with ``tqdm.write`` and
    skipped.

    A record without a ``description`` key, or with a null description, yields
    a term whose definition is ``None`` rather than aborting the whole run.

    :param version: Data version forwarded to ``ensure_path``.
    """
    for n in trange(1, 11000, desc=f'{PREFIX} download'):
        try:
            path = ensure_path(
                PREFIX,
                'phenotype',
                version=version,
                url=f'https://www.gwascentral.org/phenotype/HGVPM{n}?format=json',
                name=f'HGVPM{n}.json',
            )
        except OSError as e:
            tqdm.write(f'{n}: {e}')
            continue

        with open(path) as file:
            j = json.load(file)

        # The description is optional; only normalise it when present.
        description = j.get('description')
        if description is not None:
            description = description.strip().replace('\n', ' ')

        term = Term(
            reference=Reference(PREFIX, j['identifier'], j['name']),
            definition=description,
        )
        yield term
def _get_term_from_tree(tree: ElementTree.ElementTree) -> Term:
    """Build a term from a parsed GWAS Central XML record.

    Reads the ``name``, ``description`` and ``identifier`` children of the
    record and attaches one ``has_part`` relationship per ``experiments``
    child. Missing or empty child elements are treated as absent instead of
    raising :class:`AttributeError` on ``None.text``.

    :param tree: Parsed XML record.
    :returns: The term for the record.
    :raises ValueError: if the record has no identifier.
    """
    # findtext() returns None for a missing element and '' for an empty one;
    # ``or None`` folds both into None.
    identifier = tree.findtext('identifier') or None
    if identifier is None:
        raise ValueError('record is missing an identifier')

    name = tree.findtext('name') or None
    description = tree.findtext('description') or None
    if description:
        description = description.strip().replace('\n', ' ')

    term = Term(
        reference=Reference(PREFIX, identifier, name),
        definition=description,
    )
    for experiment in tree.findall('experiments'):
        experiment_id = experiment.findtext('identifier') or None
        if experiment_id is None:
            # A relationship without a target identifier is meaningless.
            continue
        term.append_relationship(
            has_part,
            Reference(
                'gwascentral.experiment',
                identifier=experiment_id,
                name=experiment.findtext('name') or None,
            ),
        )
    return term
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get ITIS terms.

    Ensures the ITIS archive is downloaded, extracts the SQLite database from
    it on first use, then reads every taxon name and every child/parent pair
    and yields one term per taxon with its parents attached.

    The database connection is closed as soon as both queries have been
    fetched, before any term is yielded, so a partially consumed generator
    does not keep the connection open.

    :param version: Data version forwarded to the path helpers.
    :param force: Forwarded to ``ensure_path``.
    :raises FileNotFoundError: if the SQLite file is still missing after the
        extraction step.
    """
    zip_path = ensure_path(PREFIX, url=URL, force=force, version=version)
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, name="itis.sqlite", version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for file in zip_file.filelist:
                if file.filename.endswith(".sqlite") and not file.is_dir():
                    zip_file.extract(file, sqlite_dir)
                    # Move the database to its canonical location and remove
                    # the folder the archive kept it in.
                    shutil.move(os.path.join(sqlite_dir, file.filename), sqlite_path)
                    os.rmdir(os.path.join(sqlite_dir, os.path.dirname(file.filename)))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f"file missing: {sqlite_path}")

    # Both result sets are fully materialised with fetchall(), so the
    # connection can be released before iteration starts.
    with closing(sqlite3.connect(sqlite_path.as_posix())) as conn:
        with closing(conn.cursor()) as cursor:
            cursor.execute(LONGNAMES_QUERY)
            id_to_reference = {
                str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
                for identifier, name in cursor.fetchall()
            }

        with closing(conn.cursor()) as cursor:
            cursor.execute(HIERARCHY_QUERY)
            id_to_parents = multidict(
                (str(child), str(parent)) for child, parent in cursor.fetchall()
            )

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == "0":  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms.

    Ensures the ITIS archive is downloaded, unpacks it into the prefix
    directory when the SQLite database is not there yet, then reads every
    taxon name and every child/parent pair and yields one term per taxon with
    its parents attached.

    The database connection is closed as soon as both queries have been
    fetched, before any term is yielded, so a partially consumed generator
    does not keep the connection open.

    :raises FileNotFoundError: if the SQLite file is still missing after
        unpacking the archive.
    """
    zip_path = ensure_path(PREFIX, URL)
    sqlite_path = prefix_directory_join(PREFIX, 'itisSqlite043020', 'ITIS.sqlite')
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(get_prefix_directory(PREFIX))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    # Both result sets are fully materialised with fetchall(), so the
    # connection can be released before iteration starts.
    with closing(sqlite3.connect(sqlite_path)) as conn:
        with closing(conn.cursor()) as cursor:
            cursor.execute(LONGNAMES_QUERY)
            id_to_reference = {
                str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
                for identifier, name in cursor.fetchall()
            }

        with closing(conn.cursor()) as cursor:
            cursor.execute(HIERARCHY_QUERY)
            id_to_parents = multidict(
                (str(child), str(parent))
                for child, parent in cursor.fetchall()
            )

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == '0':  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
def _get_term_from_tree(tree: ElementTree.ElementTree) -> Term:
    """Build a term from a parsed GWAS Central XML record.

    Reads the ``name``, ``description`` and ``identifier`` children of the
    record and attaches one ``has_part`` relationship per ``experiments``
    child. A missing child element is treated as absent instead of raising
    :class:`AttributeError` on ``None.text``.

    :param tree: Parsed XML record.
    :returns: The term for the record.
    :raises ValueError: if the record has no identifier.
    """

    def _text(node, tag):
        """Return the text of the first ``tag`` child of ``node``, or None."""
        element = node.find(tag)
        return None if element is None else element.text

    identifier = _text(tree, "identifier")
    if identifier is None:
        raise ValueError("record is missing an identifier")

    name = _text(tree, "name")
    description = _text(tree, "description")
    if description:
        description = description.strip().replace("\n", " ")

    term = Term(
        reference=Reference(PREFIX, identifier, name),
        definition=description,
    )
    for experiment in tree.findall("experiments"):
        experiment_name = _text(experiment, "name")
        experiment_id = _text(experiment, "identifier")
        if experiment_id is None:
            # A relationship without a target identifier is meaningless.
            continue
        term.append_relationship(
            has_part,
            Reference(
                "gwascentral.experiment",
                identifier=experiment_id,
                name=experiment_name,
            ),
        )
    return term
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get the FamPlex terms.

    Downloads the entities, relations and descriptions tables for the given
    FamPlex version, indexes the relations by head and by tail, and yields one
    term per entity with its definition, provenance, parents and relationships.

    Relations whose head or tail is in the ``UP`` namespace are skipped.
    Outgoing ``isa`` edges to another FamPlex entity become parents; other
    outgoing ``isa``/``partof`` edges become ``is_a``/``part_of``
    relationships; incoming ``isa``/``partof`` edges become
    ``has_member``/``has_part`` relationships.

    :param version: FamPlex version (used in the GitHub raw URL).
    :param force: Forwarded to ``ensure_df``.
    :raises ValueError: if a relation uses a namespace other than ``HGNC``,
        ``FPLX`` or ``UP``.
    """
    base_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}'

    entities_url = f'{base_url}/entities.csv'
    entities_df = ensure_df(PREFIX, url=entities_url, version=version, dtype=str, force=force)

    relations_url = f'{base_url}/relations.csv'
    relations_df = ensure_df(
        PREFIX, url=relations_url, version=version, header=None, sep=',', dtype=str, force=force,
    )

    definitions_url = f'{base_url}/descriptions.csv'
    definitions_df = ensure_df(
        PREFIX, url=definitions_url, version=version, header=None, sep=',', dtype=str, force=force,
    )
    id_to_definition = {
        identifier: (definition, provenance)
        for identifier, provenance, definition in definitions_df.values
    }

    # TODO add xrefs
    # xrefs_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}/equivalences.csv'
    # xrefs_df = ensure_df(PREFIX, url=xrefs_url, version=version, header=None, sep=',', dtype=str)

    hgnc_name_to_id = get_name_id_mapping('hgnc')
    in_edges = defaultdict(list)
    out_edges = defaultdict(list)
    for h_ns, h_name, r, t_ns, t_name in relations_df.values:
        if h_ns == 'HGNC':
            h_identifier = hgnc_name_to_id.get(h_name)
            if h_identifier is None:
                # NOTE(review): the reference is still built with identifier=None.
                logger.warning('[%s] could not look up HGNC identifier for gene: %s', PREFIX, h_name)
            h = Reference(prefix='hgnc', identifier=h_identifier, name=h_name)
        elif h_ns == 'FPLX':
            h = Reference(prefix='fplx', identifier=h_name, name=h_name)
        elif h_ns == 'UP':
            continue
        else:
            raise ValueError(f'unhandled head namespace: {h_ns}')

        if t_ns == 'HGNC':
            t_identifier = hgnc_name_to_id.get(t_name)
            if t_identifier is None:
                logger.warning('[%s] could not look up HGNC identifier for gene: %s', PREFIX, t_name)
            t = Reference(prefix='hgnc', identifier=t_identifier, name=t_name)
        elif t_ns == 'FPLX':
            t = Reference(prefix='fplx', identifier=t_name, name=t_name)
        elif t_ns == 'UP':
            # Must test the tail namespace here; the head was handled above.
            continue
        else:
            raise ValueError(f'unhandled tail namespace: {t_ns}')

        out_edges[h].append((r, t))
        in_edges[t].append((r, h))

    for (entity,) in entities_df.values:
        reference = Reference(prefix=PREFIX, identifier=entity, name=entity)
        definition, provenance = id_to_definition.get(entity, (None, None))
        term = Term(
            reference=reference,
            definition=definition,
            provenance=[Reference.from_curie(provenance)] if definition is not None else None,
        )

        for r, t in out_edges.get(reference, []):
            if r == 'isa' and t.prefix == 'fplx':
                term.append_parent(t)
            elif r == 'isa':
                term.append_relationship(is_a, t)
            elif r == 'partof':
                term.append_relationship(part_of, t)
            else:
                logger.warning('unhandled relation %s', r)

        for r, h in in_edges.get(reference, []):
            if r == 'isa':
                term.append_relationship(has_member, h)
            elif r == 'partof':
                term.append_relationship(has_part, h)
            else:
                logger.warning('unhandled relation %s', r)
        yield term
def get_terms(force: bool = False, version: Optional[str] = None) -> Iterable[Term]:
    """Yield one RGD gene term per row of the genes table.

    The definition is taken from ``NAME`` when present, otherwise from
    ``GENE_DESC``. Old names and symbols become typed synonyms, the columns
    listed in ``namespace_to_column`` become xrefs or relationships, curated
    PubMed identifiers become provenance, and every term is assigned to
    species 10116.

    :param force: Forwarded to ``ensure_df``.
    :param version: Data version forwarded to ``ensure_df``.
    """
    df = ensure_df(
        PREFIX,
        url=GENES_URL,
        sep="\t",
        header=0,
        comment="#",
        dtype=str,
        force=force,
        version=version,
        quoting=3,
        error_bad_lines=False,
    )
    progress = tqdm(
        df.iterrows(), total=len(df.index), desc=f"Mapping {PREFIX}", unit_scale=True
    )
    for _, row in progress:
        definition = None
        for column in ("NAME", "GENE_DESC"):
            if pd.notna(row[column]):
                definition = row[column]
                break

        gene = Reference(prefix=PREFIX, identifier=row["GENE_RGD_ID"], name=row["SYMBOL"])
        term = Term(reference=gene, definition=definition)

        for column, synonym_type in (
            ("OLD_NAME", old_name_type),
            ("OLD_SYMBOL", old_symbol_type),
        ):
            joined = row[column]
            if joined and pd.notna(joined):
                for text in joined.split(";"):
                    term.append_synonym(Synonym(name=text, type=synonym_type))

        for prefix, key in namespace_to_column:
            xref_ids = str(row[key])
            if not xref_ids or pd.isna(xref_ids):
                continue
            for xref_id in xref_ids.split(";"):
                if xref_id == "nan":
                    continue
                if prefix == "uniprot":
                    term.append_relationship(
                        has_gene_product,
                        Reference.auto(prefix=prefix, identifier=xref_id),
                    )
                elif prefix != "ensembl":
                    term.append_xref(Reference(prefix=prefix, identifier=xref_id))
                # second one is reverse strand
                elif xref_id.startswith(("ENSMUSG", "ENSRNOG")):
                    term.append_xref(Reference(prefix=prefix, identifier=xref_id))
                elif xref_id.startswith("ENSMUST"):
                    term.append_relationship(
                        transcribes_to, Reference(prefix=prefix, identifier=xref_id)
                    )
                elif xref_id.startswith("ENSMUSP"):
                    term.append_relationship(
                        has_gene_product, Reference(prefix=prefix, identifier=xref_id)
                    )
                else:
                    logger.warning("[%s] unhandled xref ensembl:%s", PREFIX, xref_id)

        pubmed_ids = row["CURATED_REF_PUBMED_ID"]
        if pubmed_ids and pd.notna(pubmed_ids):
            for pubmed_id in str(pubmed_ids).split(";"):
                term.append_provenance(Reference(prefix="pubmed", identifier=pubmed_id))

        term.set_species(identifier="10116", name="Rattus norvegicus")
        yield term
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get the FamPlex terms.

    Downloads the entities, relations and descriptions tables for the given
    FamPlex version, loads the cross-references with ``_get_xref_df``, indexes
    the relations by head and by tail, and yields one term per entity with its
    definition, provenance, xrefs, parents and relationships.

    Relations whose head or tail is in the ``UP`` namespace are skipped.
    Outgoing ``isa`` edges to another FamPlex entity become parents; other
    outgoing ``isa``/``partof`` edges become ``is_a``/``part_of``
    relationships; incoming ``isa``/``partof`` edges become
    ``has_member``/``has_part`` relationships.

    :param version: FamPlex version (used in the GitHub raw URL).
    :param force: Forwarded to ``ensure_df``.
    :raises ValueError: if a relation uses a namespace other than ``HGNC``,
        ``FPLX`` or ``UP``.
    """
    base_url = f"https://raw.githubusercontent.com/sorgerlab/famplex/{version}"

    entities_url = f"{base_url}/entities.csv"
    entities_df = ensure_df(PREFIX, url=entities_url, version=version, dtype=str, force=force)

    relations_url = f"{base_url}/relations.csv"
    relations_df = ensure_df(
        PREFIX, url=relations_url, version=version, header=None, sep=",", dtype=str, force=force
    )

    definitions_url = f"{base_url}/descriptions.csv"
    definitions_df = ensure_df(
        PREFIX,
        url=definitions_url,
        version=version,
        header=None,
        sep=",",
        dtype=str,
        force=force,
    )
    id_to_definition = {
        identifier: (definition, provenance)
        for identifier, provenance, definition in definitions_df.values
    }

    id_xrefs = _get_xref_df(version)

    hgnc_name_to_id = get_name_id_mapping("hgnc")
    in_edges = defaultdict(list)
    out_edges = defaultdict(list)
    for h_ns, h_name, r, t_ns, t_name in relations_df.values:
        if h_ns == "HGNC":
            h_identifier = hgnc_name_to_id.get(h_name)
            if h_identifier is None:
                # NOTE(review): the reference is still built with identifier=None.
                logger.warning(
                    "[%s] could not look up HGNC identifier for gene: %s", PREFIX, h_name
                )
            h = Reference(prefix="hgnc", identifier=h_identifier, name=h_name)
        elif h_ns == "FPLX":
            h = Reference(prefix="fplx", identifier=h_name, name=h_name)
        elif h_ns == "UP":
            continue
        else:
            raise ValueError(f"unhandled head namespace: {h_ns}")

        if t_ns == "HGNC":
            t_identifier = hgnc_name_to_id.get(t_name)
            if t_identifier is None:
                logger.warning(
                    "[%s] could not look up HGNC identifier for gene: %s", PREFIX, t_name
                )
            t = Reference(prefix="hgnc", identifier=t_identifier, name=t_name)
        elif t_ns == "FPLX":
            t = Reference(prefix="fplx", identifier=t_name, name=t_name)
        elif t_ns == "UP":
            # Must test the tail namespace here; the head was handled above.
            continue
        else:
            raise ValueError(f"unhandled tail namespace: {t_ns}")

        out_edges[h].append((r, t))
        in_edges[t].append((r, h))

    for (entity,) in entities_df.values:
        reference = Reference(prefix=PREFIX, identifier=entity, name=entity)
        definition, provenance = id_to_definition.get(entity, (None, None))
        term = Term(
            reference=reference,
            definition=definition,
            provenance=[Reference.from_curie(provenance)] if definition is not None else None,
        )

        for xref_reference in id_xrefs.get(entity, []):
            term.append_xref(xref_reference)

        for r, t in out_edges.get(reference, []):
            if r == "isa" and t.prefix == "fplx":
                term.append_parent(t)
            elif r == "isa":
                term.append_relationship(is_a, t)
            elif r == "partof":
                term.append_relationship(part_of, t)
            else:
                logger.warning("unhandled relation %s", r)

        for r, h in in_edges.get(reference, []):
            if r == "isa":
                term.append_relationship(has_member, h)
            elif r == "partof":
                term.append_relationship(has_part, h)
            else:
                logger.warning("unhandled relation %s", r)
        yield term
def get_terms(version: str) -> Iterable[Term]:
    """Yield one ComplexPortal term per row of the complexes table.

    The table's verbose column headers are renamed to short keys in place,
    then the alias, member and cross-reference columns are parsed.
    Cross-references annotated ``identity`` become xrefs; ``see-also``
    references with the ``pubmed`` prefix become provenance; every other
    (note, prefix) combination is logged once at debug level and dropped.
    Each parsed member contributes a ``has_part`` relationship.

    :param version: Data version forwarded to ``get_df``.
    """
    column_names = {
        "Aliases for complex": "aliases",
        "Identifiers (and stoichiometry) of molecules in complex": "members",
        "Taxonomy identifier": "taxonomy_id",
        "Cross references": "xrefs",
        "Description": "definition",
        "Recommended name": "name",
        "#Complex ac": "complexportal_id",
    }
    df = get_df(version=version)
    df.rename(columns=column_names, inplace=True)

    df["aliases"] = df["aliases"].map(lambda s: s.split("|") if pd.notna(s) else [])
    df["members"] = df["members"].map(_parse_members)
    df["xrefs"] = df["xrefs"].map(_parse_xrefs)

    taxonomy_names = get_id_name_mapping("ncbitaxon")
    df["taxonomy_name"] = df["taxonomy_id"].map(taxonomy_names.get)

    selected_columns = [
        "complexportal_id",
        "name",
        "definition",
        "aliases",
        "xrefs",
        "taxonomy_id",
        "taxonomy_name",
        "members",
    ]
    slim_df = df[selected_columns]

    # (note, prefix) pairs already reported, so each is logged only once.
    already_reported = set()
    rows = tqdm(slim_df.values, total=len(slim_df.index), desc=f"mapping {PREFIX}")
    for row in rows:
        (
            complexportal_id,
            name,
            definition,
            aliases,
            xrefs,
            taxonomy_id,
            taxonomy_name,
            members,
        ) = row

        identity_xrefs = []
        publications = []
        for reference, note in xrefs:
            if note == "identity":
                identity_xrefs.append(reference)
                continue
            if note == "see-also" and reference.prefix == "pubmed":
                publications.append(reference)
                continue
            if (note, reference.prefix) not in already_reported:
                logger.debug(f"unhandled xref type: {note} / {reference.prefix}")
                already_reported.add((note, reference.prefix))

        if pd.notna(definition):
            cleaned_definition = definition.strip()
        else:
            cleaned_definition = None

        term = Term(
            reference=Reference(prefix=PREFIX, identifier=complexportal_id, name=name),
            definition=cleaned_definition,
            synonyms=[Synonym(name=alias) for alias in aliases],
            xrefs=identity_xrefs,
            provenance=publications,
        )
        term.set_species(identifier=taxonomy_id, name=taxonomy_name)

        for member_reference, _count in members:
            term.append_relationship(has_part, member_reference)

        yield term
def get_terms(force: bool = False, version: Optional[str] = None) -> Iterable[Term]:
    """Yield ZFIN marker terms, preceded by the Sequence Ontology parent terms.

    Loads the alternate-identifier, ortholog (human, mouse, fly), Entrez and
    UniProt mapping tables plus the markers table. Each marker term is
    assigned to species 7955 and receives its SO parent, alternate
    identifiers, an optional ``ncbigene`` xref, gene products and orthologs.

    :param force: Forwarded to ``ensure_df``.
    :param version: Data version forwarded to ``ensure_df``.
    """

    def _load_pairs(url, usecols):
        """Load two columns of a headerless table as an array of pairs."""
        return ensure_df(
            PREFIX, url=url, force=force, header=None, usecols=usecols, version=version
        ).values

    alt_ids_df = ensure_df(
        PREFIX,
        url=ALTS_URL,
        name="alts.tsv",
        force=force,
        header=None,
        names=["alt", "zfin_id"],
        version=version,
    )
    primary_to_alt_ids = defaultdict(set)
    for alt_id, zfin_id in alt_ids_df.values:
        primary_to_alt_ids[zfin_id].add(alt_id)

    human_orthologs = multisetdict(_load_pairs(HUMAN_ORTHOLOGS, [0, 7]))
    mouse_orthologs = multisetdict(_load_pairs(MOUSE_ORTHOLOGS, [0, 5]))
    fly_orthologs = multisetdict(_load_pairs(FLY_ORTHOLOGS, [0, 5]))
    entrez_mappings = dict(_load_pairs(ENTREZ_MAPPINGS, [0, 3]))
    uniprot_mappings = multidict(_load_pairs(UNIPROT_MAPPINGS, [0, 3]))

    df = ensure_df(
        PREFIX,
        url=URL,
        name="markers.tsv",
        force=force,
        header=None,
        names=MARKERS_COLUMNS,
        version=version,
    )
    # Drop the "SO:" prefix so only the local identifier remains.
    so_prefix_length = len("SO:")
    df["sequence_ontology_id"] = df["sequence_ontology_id"].map(
        lambda x: x[so_prefix_length:]
    )
    so = {}
    for so_id in df["sequence_ontology_id"].unique():
        so[so_id] = Reference.auto(prefix="SO", identifier=so_id)
    for so_id in sorted(so):
        yield Term(reference=so[so_id])

    for identifier, name, definition, _entity_type, sequence_ontology_id in tqdm(df.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=name,
            definition=None if definition == name else definition,
        )
        term.set_species(identifier="7955", name="Danio rerio")
        term.append_parent(so[sequence_ontology_id])

        # Entity type is redundant of identifier
        # term.append_property("type", entity_type)
        for alt_id in primary_to_alt_ids[identifier]:
            term.append_alt(alt_id)

        entrez_id = entrez_mappings.get(identifier)
        if entrez_id:
            term.append_xref(Reference("ncbigene", entrez_id))

        for uniprot_id in uniprot_mappings.get(identifier, []):
            term.append_relationship(
                has_gene_product, Reference.auto("uniprot", uniprot_id)
            )
        for hgnc_id in human_orthologs.get(identifier, []):
            term.append_relationship(orthologous, Reference.auto("hgnc", hgnc_id))
        for mgi_curie in mouse_orthologs.get(identifier, []):
            mouse_ortholog = Reference.from_curie(mgi_curie, auto=True)
            if not mouse_ortholog:
                continue
            term.append_relationship(orthologous, mouse_ortholog)
        for flybase_id in fly_orthologs.get(identifier, []):
            term.append_relationship(orthologous, Reference("flybase", flybase_id))

        yield term
def iter_terms(version: str) -> Iterable[Term]:
    """Yield every Rhea reaction term once all annotations are attached.

    Terms are created from the directions table (a master reaction plus its
    left-to-right, right-to-left and bidirectional forms), then enriched with
    ``is_a`` parents from the relationships table, cross-references from the
    ``rhea2*`` tables, and names from the RDF dump. Identifiers found in the
    xref tables or in the RDF but not in the directions table are logged and
    skipped.

    :param version: Rhea version forwarded to the download helpers.
    :raises ValueError: if the relationships table holds a relation other
        than ``is_a``.
    """
    terms = {}

    directions = ensure_df(
        PREFIX,
        url='ftp://ftp.expasy.org/databases/rhea/tsv/rhea-directions.tsv',
        version=version,
    )
    for master, lr, rl, bi in directions.values:
        for rhea_identifier in (master, lr, rl, bi):
            terms[rhea_identifier] = Term(reference=Reference(PREFIX, rhea_identifier))
        master_term = terms[master]
        for relation_type, directed in ((has_lr, lr), (has_rl, rl), (has_bi, bi)):
            master_term.append_relationship(relation_type, terms[directed])
        for directed in (lr, rl, bi):
            terms[directed].append_parent(terms[master])

    hierarchy = ensure_df(
        PREFIX,
        url='ftp://ftp.expasy.org/databases/rhea/tsv/rhea-relationships.tsv',
        version=version,
    )
    for source, relation, target in hierarchy.values:
        if relation == 'is_a':
            terms[source].append_parent(terms[target])
        else:
            raise ValueError(f'RHEA unrecognized relation: {relation}')

    xref_tables = [
        ('ecocyc', 'rhea2ecocyc'),
        ('kegg.reaction', 'rhea2kegg_reaction'),
        ('reactome', 'rhea2reactome'),
        ('macie', 'rhea2macie'),
        ('metacyc', 'rhea2metacyc'),
    ]
    for xref_prefix, table_name in xref_tables:
        xref_df = ensure_df(
            PREFIX,
            url=f'ftp://ftp.expasy.org/databases/rhea/tsv/{table_name}.tsv',
            version=version,
        )
        for rhea_id, _, _, xref_id in xref_df.values:
            if rhea_id in terms:
                terms[rhea_id].append_xref(Reference(xref_prefix, xref_id))
            else:
                logger.warning(
                    '[%s] could not find %s:%s for xref %s:%s',
                    PREFIX, PREFIX, rhea_id, xref_prefix, xref_id,
                )

    # TODO are EC codes equivalent?
    # TODO uniprot enabled by (RO:0002333)
    # TODO names?

    rdf_url = 'ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz'
    graph = pystow.ensure_rdf('pyobo', 'raw', PREFIX, version, url=rdf_url)
    result = graph.query('''
    PREFIX rh:<http://rdf.rhea-db.org/>
    SELECT ?reaction ?reactionId ?reactionLabel WHERE {
      ?reaction rdfs:subClassOf rh:Reaction .
      ?reaction rh:id ?reactionId .
      ?reaction rdfs:label ?reactionLabel .
    }
    ''')
    for _, raw_identifier, name in result:
        identifier = str(raw_identifier)
        if identifier in terms:
            terms[identifier].reference.name = name
        else:
            logger.warning('isolated element in rdf: rhea:%s ! %s', identifier, name)

    # TODO participants?

    yield from terms.values()
def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Term]:  # noqa:C901
    """Get HGNC terms.

    First yields the Sequence Ontology parent terms listed in
    ``LOCUS_TYPE_TO_SO``, then one term per gene in the HGNC complete set.
    Each gene term receives gene products, EC memberships, transcripts,
    rat/mouse orthologs, xrefs, provenance, gene-group memberships, synonyms,
    properties, an SO parent and the human species annotation.

    Every entry key that is read is popped from the entry, so the keys still
    left at the end of an iteration are counted as unhandled. After the last
    term, ``unhandled.json`` and ``unhandled.md`` are written to the prefix
    directory and the unhandled locus types and keys are logged.

    Entries whose status is not ``Approved`` are marked obsolete; each such
    status is logged once.

    :param version: Data version; defaults to the first day of the current
        month formatted as ``YYYY-MM-01``.
    :param force: Forwarded to ``ensure_path``.
    """
    if version is None:
        version = datetime.date.today().strftime("%Y-%m-01")
    unhandled_entry_keys: typing.Counter[str] = Counter()
    unhandle_locus_types: DefaultDict[str, Dict[str, Term]] = defaultdict(dict)
    path = ensure_path(
        PREFIX,
        url=DEFINITIONS_URL_FMT.format(version=version),
        force=force,
        version=version,
        name="hgnc_complete_set.json",
    )
    with open(path) as file:
        entries = json.load(file)["response"]["docs"]

    yield from sorted(
        {
            Term(reference=Reference.auto("SO", so_id))
            for so_id in sorted(LOCUS_TYPE_TO_SO.values())
            if so_id
        },
        key=attrgetter("identifier"),
    )

    # Non-approved statuses already reported, so each is logged only once.
    statuses = set()
    for entry in tqdm(entries, desc=f"Mapping {PREFIX}", unit="gene", unit_scale=True):
        name, symbol, identifier = (
            entry.pop("name"),
            entry.pop("symbol"),
            entry.pop("hgnc_id")[len("HGNC:"):],
        )
        status = entry.pop("status")
        is_obsolete = status != "Approved"
        if is_obsolete and status not in statuses:
            # Previously the second entry with the same status raised
            # ValueError; every non-approved entry is now simply obsolete.
            statuses.add(status)
            logger.warning("UNHANDLED %s", status)

        term = Term(
            definition=name,
            reference=Reference(prefix=PREFIX, identifier=identifier, name=symbol),
            is_obsolete=is_obsolete,
        )

        for uniprot_id in entry.pop("uniprot_ids", []):
            term.append_relationship(
                has_gene_product,
                Reference.auto("uniprot", uniprot_id),
            )
        for ec_code in entry.pop("enzyme_id", []):
            if "-" in ec_code:
                continue  # only add concrete annotations
            term.append_relationship(
                gene_product_member_of,
                Reference.auto("eccode", ec_code),
            )
        for rna_central_ids in entry.pop("rna_central_id", []):
            for rna_central_id in rna_central_ids.split(","):
                term.append_relationship(
                    transcribes_to,
                    Reference(prefix="rnacentral", identifier=rna_central_id.strip()),
                )
        mirbase_id = entry.pop("mirbase", None)
        if mirbase_id:
            term.append_relationship(
                transcribes_to,
                Reference.auto(
                    "mirbase",
                    mirbase_id,
                ),
            )
        snornabase_id = entry.pop("snornabase", None)
        if snornabase_id:
            term.append_relationship(
                transcribes_to, Reference(prefix="snornabase", identifier=snornabase_id)
            )

        for rgd_curie in entry.pop("rgd_id", []):
            if not rgd_curie.startswith("RGD:"):
                logger.warning(f"hgnc:{identifier} had bad RGD CURIE: {rgd_curie}")
                continue
            rgd_id = rgd_curie[len("RGD:"):]
            term.append_relationship(
                orthologous,
                Reference.auto(prefix="rgd", identifier=rgd_id),
            )
        for mgi_curie in entry.pop("mgd_id", []):
            if not mgi_curie.startswith("MGI:"):
                logger.warning(f"hgnc:{identifier} had bad MGI CURIE: {mgi_curie}")
                continue
            mgi_id = mgi_curie[len("MGI:"):]
            if not mgi_id:
                continue
            term.append_relationship(
                orthologous,
                Reference.auto(prefix="mgi", identifier=mgi_id),
            )

        for xref_prefix, key in gene_xrefs:
            xref_identifiers = entry.pop(key, None)
            if xref_identifiers is None:
                continue
            # Some keys hold a single value, others a list; normalise to a list.
            if not isinstance(xref_identifiers, list):
                xref_identifiers = [xref_identifiers]
            for xref_identifier in xref_identifiers:
                term.append_xref(
                    Reference(prefix=xref_prefix, identifier=str(xref_identifier))
                )

        for pubmed_id in entry.pop("pubmed_id", []):
            term.append_provenance(Reference(prefix="pubmed", identifier=str(pubmed_id)))

        gene_group_ids = entry.pop("gene_group_id", [])
        gene_groups = entry.pop("gene_group", [])
        for gene_group_id, gene_group_label in zip(gene_group_ids, gene_groups):
            term.append_relationship(
                member_of,
                Reference(
                    prefix="hgnc.genegroup",
                    identifier=str(gene_group_id),
                    name=gene_group_label,
                ),
            )

        for alias_symbol in entry.pop("alias_symbol", []):
            term.append_synonym(Synonym(name=alias_symbol, type=alias_symbol_type))
        for alias_name in entry.pop("alias_name", []):
            term.append_synonym(Synonym(name=alias_name, type=alias_name_type))
        # HGNC publishes previous symbols under "prev_symbol" (matching
        # "prev_name" below); the legacy "previous_symbol" key is still read.
        previous_symbols = entry.pop("previous_symbol", []) + entry.pop("prev_symbol", [])
        for previous_symbol in previous_symbols:
            term.append_synonym(Synonym(name=previous_symbol, type=previous_symbol_type))
        for previous_name in entry.pop("prev_name", []):
            term.append_synonym(Synonym(name=previous_name, type=previous_name_type))

        for prop in ["location"]:
            value = entry.pop(prop, None)
            if value:
                term.append_property(prop, value)

        locus_type = entry.pop("locus_type")
        locus_group = entry.pop("locus_group")
        so_id = LOCUS_TYPE_TO_SO.get(locus_type)
        if so_id:
            term.append_parent(Reference.auto("SO", so_id))
        else:
            term.append_parent(Reference.auto("SO", "0000704"))  # gene
            unhandle_locus_types[locus_type][identifier] = term
            term.append_property("locus_type", locus_type)
            term.append_property("locus_group", locus_group)

        term.set_species(identifier="9606", name="Homo sapiens")

        # Whatever is left in the entry was not consumed above.
        for key in entry:
            unhandled_entry_keys[key] += 1
        yield term

    with open(prefix_directory_join(PREFIX, name="unhandled.json"), "w") as file:
        json.dump(
            {
                k: {hgnc_id: term.name for hgnc_id, term in v.items()}
                for k, v in unhandle_locus_types.items()
            },
            file,
            indent=2,
        )

    with open(prefix_directory_join(PREFIX, name="unhandled.md"), "w") as file:
        for k, v in sorted(unhandle_locus_types.items()):
            t = tabulate(
                [
                    (
                        hgnc_id,
                        term.name,
                        term.is_obsolete,
                        term.link,
                        ", ".join(p.link for p in term.provenance if p.link),
                    )
                    for hgnc_id, term in sorted(v.items())
                ],
                headers=["hgnc_id", "name", "obsolete", "link", "provenance"],
                tablefmt="github",
            )
            print(f"## {k} ({len(v)})", file=file)  # noqa: T201
            print(t, "\n", file=file)  # noqa: T201

    unhandle_locus_type_counter = Counter(
        {locus_type: len(d) for locus_type, d in unhandle_locus_types.items()}
    )
    logger.warning(
        "Unhandled locus types:\n%s", tabulate(unhandle_locus_type_counter.most_common())
    )
    logger.warning("Unhandled keys:\n%s", tabulate(unhandled_entry_keys.most_common()))