def _get_term_from_tree(tree: ElementTree.ElementTree) -> Term:
    """Build a term from a parsed tree.

    The ``name``, ``description`` and ``identifier`` texts are looked up with
    ``_find_text``. Every ``experiments`` child that has an identifier is
    attached to the term through a ``has_part`` relationship.

    :param tree: the parsed element tree to read from
    :returns: the term built from the tree
    :raises ValueError: if no identifier text is found
    """
    label = _find_text(tree, "name")
    definition = _find_text(tree, "description")
    if definition:
        # Collapse the description onto a single line.
        definition = definition.strip().replace("\n", " ")
    term_id = _find_text(tree, "identifier")
    if term_id is None:
        raise ValueError

    term = Term(reference=Reference(PREFIX, term_id, label), definition=definition)
    for node in tree.findall("experiments"):
        node_name = _find_text(node, "name")
        node_id = _find_text(node, "identifier")
        if node_id is not None:
            target = Reference(
                "gwascentral.experiment",
                identifier=node_id,
                name=node_name,
            )
            term.append_relationship(has_part, target)
    return term
def iter_terms() -> Iterable[Term]:
    """Iterate over terms for KEGG Genome.

    One term is yielded per genome from ``iter_kegg_genomes``. A genome with a
    taxonomy identifier gets an ``ncbitaxon`` xref. The number of taxonomy
    identifiers whose name could not be looked up is logged after the last
    genome has been yielded.
    """
    missing_names = 0
    for genome in iter_kegg_genomes():
        taxonomy_xrefs = []
        if genome.taxonomy_id is not None:
            resolved_name = pyobo.get_name('ncbitaxon', genome.taxonomy_id)
            if resolved_name is None:
                missing_names += 1
                tqdm.write(f'could not find name for taxonomy:{genome.taxonomy_id}')
            # The xref is recorded even when no name could be resolved.
            taxonomy_xrefs.append(
                Reference(
                    prefix='ncbitaxon',
                    identifier=genome.taxonomy_id,
                    name=resolved_name,
                )
            )

        genome_reference = Reference(
            prefix='kegg.genome',
            identifier=genome.identifier,
            name=genome.name,
        )
        yield Term(reference=genome_reference, xrefs=taxonomy_xrefs)

    logger.info('[%s] unable to find %d taxonomy names in NCBI', KEGG_GENOME_PREFIX, missing_names)
def _iter_genome_terms(
    *,
    list_pathway_path: str,
    link_pathway_path: str,
    kegg_genome: KEGGGenome,
) -> Iterable[Term]:
    """Yield one pathway term per line of the pathway list file.

    :param list_pathway_path: path to a tab-separated file with one
        ``path:<id>`` / name pair per line
    :param link_pathway_path: path passed to ``_get_link_pathway_map`` to get
        the protein identifiers of each pathway
    :param kegg_genome: the genome whose ``annotate_term`` is applied to every
        term and whose name appears in the "could not find" messages
    :yields: the pathway terms, in the order they appear in the list file
    """
    with open(list_pathway_path) as file:
        stripped_lines = [raw.strip() for raw in file]

    terms_by_id = {}
    for stripped in stripped_lines:
        raw_id, label = (field.strip() for field in stripped.split("\t"))
        # Drop the leading "path:" sized prefix from the identifier.
        identifier = raw_id[len("path:"):]
        pathway_term = Term.from_triple(
            prefix=KEGG_PATHWAY_PREFIX,
            identifier=identifier,
            name=label,
        )
        terms_by_id[identifier] = pathway_term

        # Annotate species information
        kegg_genome.annotate_term(pathway_term)

        # Annotate the non-species specific code: everything from the first
        # numeric character of the identifier onwards.
        digit_positions = [index for index, char in enumerate(identifier) if char.isnumeric()]
        code = identifier[min(digit_positions):]
        generic_pathway = Reference(prefix=KEGG_PATHWAY_PREFIX, identifier=f"map{code}")
        pathway_term.append_relationship(species_specific, generic_pathway)

    link_map = _get_link_pathway_map(link_pathway_path)
    for linked_id, protein_ids in link_map.items():
        linked_term = terms_by_id.get(linked_id)
        if linked_term is not None:
            for protein_id in protein_ids:
                gene = Reference(prefix=KEGG_GENES_PREFIX, identifier=protein_id)
                linked_term.append_relationship(has_part, gene)
        else:
            tqdm.write(
                f"could not find kegg.pathway:{linked_id} for {kegg_genome.name}"
            )

    yield from terms_by_id.values()
def get_terms(force: bool = False) -> Iterable[Term]:
    """Get terms.

    One term is yielded per row of ``gene_info.tsv``. Gene products and
    synonyms are added as synonyms, and each usable UniProt identifier from
    ``uniprot_mappings.tsv`` is attached with ``has_gene_product``.

    :param force: forwarded to ``ensure_df`` for both tables
    :yields: the terms, each with its species set to Dictyostelium discoideum
    """
    # Placeholder values found in the UniProt column that are not identifiers.
    non_identifiers = {"unknown", "pseudogene"}

    # DDB ID	DDB_G ID	Name	UniProt ID
    uniprot_mappings = multisetdict(
        ensure_df(PREFIX, url=URL, force=force, name="uniprot_mappings.tsv", usecols=[1, 3]).values
    )

    terms = ensure_df(PREFIX, url=URL, force=force, name="gene_info.tsv")
    # GENE ID (DDB_G ID)	Gene Name	Synonyms	Gene products
    for identifier, name, synonyms, products in tqdm(terms.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=name,
        )
        if products and pd.notna(products) and products != "unknown":
            for synonym in products.split(","):
                term.append_synonym(synonym.strip())
        if synonyms and pd.notna(synonyms):
            for synonym in synonyms.split(","):
                term.append_synonym(Synonym(synonym.strip()))
        for uniprot_id in uniprot_mappings.get(identifier, []):
            # Skip empty, missing and placeholder values. The previous check
            # used ``not in`` here, which skipped every real identifier and
            # kept only the placeholders.
            if not uniprot_id or pd.isna(uniprot_id) or uniprot_id in non_identifiers:
                continue
            term.append_relationship(has_gene_product, Reference.auto("uniprot", uniprot_id))

        term.set_species(identifier="44689", name="Dictyostelium discoideum")
        yield term
def _get_xref_df(version: str) -> Mapping[str, List[Reference]]:
    """Load the FamPlex equivalences table as a mapping of identifier to xrefs.

    :param version: used both in the download URL and as the ``version``
        argument of ``ensure_df``
    :returns: the result of ``multidict`` over (identifier, xref reference)
        pairs, after dropping rows whose prefix could not be normalized or
        normalized to ``bel``
    """
    famplex_root = f"https://raw.githubusercontent.com/sorgerlab/famplex/{version}"
    equivalences_url = f"{famplex_root}/equivalences.csv"
    xrefs_df = ensure_df(
        PREFIX, url=equivalences_url, version=version, header=None, sep=",", dtype=str
    )

    # Normalize nextprot families
    ns_remapping = {
        "NXP": "nextprot.family",
    }

    def _remap_namespace(namespace):
        return ns_remapping.get(namespace, namespace)

    xrefs_df[0] = xrefs_df[0].map(_remap_namespace)
    # nextprot family identifiers lose their leading "FA:" sized prefix.
    xrefs_df[1] = [
        local_id[len("FA:"):] if namespace == "nextprot.family" else local_id
        for namespace, local_id in zip(xrefs_df[0], xrefs_df[1])
    ]

    xrefs_df[0] = xrefs_df[0].map(normalize_prefix)
    keep = xrefs_df[0].notna() & (xrefs_df[0] != "bel")
    xrefs_df = xrefs_df[keep]

    pairs = (
        (famplex_id, Reference(xref_prefix, xref_identifier))
        for xref_prefix, xref_identifier, famplex_id in xrefs_df.values
    )
    return multidict(pairs)
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over terms for KEGG Genome.

    Genomes whose identifier is in ``SKIP`` are ignored. A genome with a
    taxonomy identifier gets an ``ncbitaxon`` xref, and the number of taxonomy
    names that could not be looked up is logged once iteration finishes.

    :param version: forwarded to ``iter_kegg_genomes``
    """
    missing_names = 0
    for genome in iter_kegg_genomes(version=version, desc="KEGG Genomes"):
        if genome.identifier in SKIP:
            continue

        genome_term = Term.from_triple(
            prefix=KEGG_GENOME_PREFIX,
            identifier=genome.identifier,
            name=genome.name,
        )
        if genome.taxonomy_id is not None:
            resolved_name = pyobo.get_name("ncbitaxon", genome.taxonomy_id)
            if resolved_name is None:
                missing_names += 1
                logger.debug(
                    f"[{KEGG_GENOME_PREFIX}] could not find name for taxonomy:{genome.taxonomy_id}"
                )
            # The xref is recorded even when no name could be resolved.
            taxonomy_reference = Reference(
                prefix="ncbitaxon",
                identifier=genome.taxonomy_id,
                name=resolved_name,
            )
            genome_term.append_xref(taxonomy_reference)

        yield genome_term

    logger.info("[%s] unable to find %d taxonomy names in NCBI", KEGG_GENOME_PREFIX, missing_names)
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Iterate over terms from GWAS Central Phenotype.

    Phenotype numbers 1 through 10999 are tried in order. A number whose
    download raises ``OSError`` is reported through ``tqdm.write`` and skipped.

    :param version: forwarded to ``ensure_path``
    :param force: forwarded to ``ensure_path``
    """
    for number in trange(1, 11000, desc=f"{PREFIX} download"):
        try:
            json_path = ensure_path(
                PREFIX,
                "phenotype",
                version=version,
                url=f"https://www.gwascentral.org/phenotype/HGVPM{number}?format=json",
                name=f"HGVPM{number}.json",
                force=force,
            )
        except OSError as e:
            tqdm.write(f"{number}: {e}")
            continue

        with open(json_path) as file:
            record = json.load(file)

        definition = record.get("description")
        if definition is not None:
            # Collapse the description onto a single line.
            definition = definition.strip().replace("\n", " ")

        yield Term(
            reference=Reference(PREFIX, record["identifier"], record["name"]),
            definition=definition,
        )
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Iterate over DrugCentral terms.

    Rows missing a SMILES string, InChI or InChIKey are logged and skipped.

    :param version: used in the download URL and forwarded to ``ensure_df``
    :param force: forwarded to ``ensure_df``
    """
    structures_url = (
        f"https://unmtid-shinyapps.net/download/DrugCentral/{version}/structures.smiles.tsv"
    )
    structures = ensure_df(PREFIX, url=structures_url, version=version, force=force)
    for smiles, inchi, inchi_key, drugcentral_id, drugcentral_name, cas in structures.values:
        if any(pd.isna(value) for value in (smiles, inchi, inchi_key)):
            logger.warning("missing data for drugcentral:%s", drugcentral_id)
            continue

        drug = Term.from_triple(prefix=PREFIX, identifier=drugcentral_id, name=drugcentral_name)
        drug.append_xref(Reference(prefix="inchikey", identifier=inchi_key))
        drug.append_property("smiles", smiles)
        drug.append_property("inchi", inchi)
        if not pd.isna(cas):
            drug.append_xref(Reference(prefix="cas", identifier=cas))
        yield drug
def test_extract_definition(self):
    """Test extracting a definition.

    Covers a bare quoted definition, an empty reference list, and one and two
    PubMed references.
    """
    expected_text = "Test Text."
    cases = [
        (f'"{expected_text}"', []),
        (f'"{expected_text}" []', []),
        (f'"{expected_text}" [PMID:1234]', [Reference('pubmed', '1234')]),
        (
            f'"{expected_text}" [PMID:1234, PMID:1235]',
            [Reference('pubmed', '1234'), Reference('pubmed', '1235')],
        ),
    ]
    for s, expected_references in cases:
        with self.subTest(s=s):
            result = _extract_definition(s, prefix='chebi', identifier='XXX')
            actual_text, actual_references = result
            self.assertEqual(expected_text, actual_text)
            self.assertEqual(expected_references, actual_references)
def test_extract_definition_with_escapes(self):
    """Test extracting a definition with escapes in it.

    The input contains backslash-escaped double quotes; the expected text
    contains the plain double quotes.
    """
    raw = '''"The canonical 3' splice site has the sequence \\"AG\\"." [PMID:1234]'''
    wanted_text = '''The canonical 3' splice site has the sequence "AG".'''
    wanted_references = [Reference('pubmed', '1234')]

    result = _extract_definition(raw, strict=True, prefix='chebi', identifier='XXX')
    got_text, got_references = result

    self.assertEqual(wanted_text, got_text)
    self.assertEqual(wanted_references, got_references)
def test_extract_definition_with_escapes(self):
    """Test extracting a definition with escapes in it.

    The input contains backslash-escaped double quotes; the expected text
    contains the plain double quotes.
    """
    raw = """"The canonical 3' splice site has the sequence \\"AG\\"." [PMID:1234]"""
    wanted_text = """The canonical 3' splice site has the sequence "AG"."""
    wanted_references = [Reference("pubmed", "1234")]

    result = _extract_definition(raw, strict=True, prefix="chebi", identifier="XXX")
    got_text, got_references = result

    self.assertEqual(wanted_text, got_text)
    self.assertEqual(wanted_references, got_references)
def test_extract_definition(self):
    """Test extracting a definition.

    Covers a bare quoted definition, an empty reference list, and one and two
    PubMed references.
    """
    expected_text = "Test Text."
    no_references = (f'"{expected_text}"', [])
    empty_brackets = (f'"{expected_text}" []', [])
    one_reference = (f'"{expected_text}" [PMID:1234]', [Reference("pubmed", "1234")])
    two_references = (
        f'"{expected_text}" [PMID:1234, PMID:1235]',
        [Reference("pubmed", "1234"), Reference("pubmed", "1235")],
    )
    for s, expected_references in (no_references, empty_brackets, one_reference, two_references):
        with self.subTest(s=s):
            result = _extract_definition(s, prefix="chebi", identifier="XXX")
            actual_text, actual_references = result
            self.assertEqual(expected_text, actual_text)
            self.assertEqual(expected_references, actual_references)
def get_terms() -> Iterable[Term]:
    """Get ComplexPortal terms.

    One term is yielded per row of the table returned by ``get_df``. Xrefs
    noted as ``identity`` become term xrefs and ``see-also`` PubMed xrefs
    become provenance. Every other (note, prefix) combination is logged once
    at debug level and dropped. Members are attached with ``has_part``.
    """
    df = get_df()
    df['aliases'] = df['aliases'].map(lambda s: s.split('|') if pd.notna(s) else [])
    df['members'] = df['members'].map(_parse_members)
    df['xrefs'] = df['xrefs'].map(_parse_xrefs)

    taxonomy_id_to_name = get_id_name_mapping('ncbitaxon')
    df['taxonomy_name'] = df['taxonomy_id'].map(taxonomy_id_to_name.get)

    slim_df = df[[
        'complexportal_id',
        'name',
        'definition',
        'aliases',
        'xrefs',
        'taxonomy_id',
        'taxonomy_name',
        'members',
    ]]
    it = tqdm(slim_df.values, total=len(slim_df.index), desc=f'mapping {PREFIX}')
    # Remember which (note, prefix) pairs were already reported so each is
    # logged only once.
    unhandled_xref_type = set()
    for complexportal_id, name, definition, aliases, xrefs, taxonomy_id, taxonomy_name, members in it:
        synonyms = [Synonym(name=alias) for alias in aliases]
        _xrefs = []
        provenance = []
        for reference, note in xrefs:
            if note == 'identity':
                _xrefs.append(reference)
            elif note == 'see-also' and reference.prefix == 'pubmed':
                provenance.append(reference)
            elif (note, reference.prefix) not in unhandled_xref_type:
                logger.debug('unhandled xref type: %s / %s', note, reference.prefix)
                unhandled_xref_type.add((note, reference.prefix))

        term = Term(
            reference=Reference(prefix=PREFIX, identifier=complexportal_id, name=name),
            # A missing definition is NaN (a float), which has no ``strip``.
            definition=definition.strip() if pd.notna(definition) else None,
            synonyms=synonyms,
            xrefs=_xrefs,
            provenance=provenance,
        )
        term.set_species(identifier=taxonomy_id, name=taxonomy_name)

        for reference, _count in members:
            term.append_relationship(has_part, reference)

        yield term
def test_extract_synonym(self):
    """Test extracting synonym strings.

    Covers a synonym with specificity and type, with specificity only, with
    neither, and with an empty provenance list.
    """
    iupac_name = SynonymTypeDef(id="IUPAC_NAME", name="IUPAC NAME")
    typedefs = {"IUPAC_NAME": iupac_name}

    def _orphanet():
        # A fresh list per synonym so the expected objects share no state.
        return [Reference("orphanet", "93938")]

    with_type = Synonym(
        name="LTEC I",
        specificity="EXACT",
        type=iupac_name,
        provenance=_orphanet(),
    )
    with_specificity = Synonym(name="LTEC I", specificity="EXACT", provenance=_orphanet())
    without_specificity = Synonym(name="LTEC I", specificity="EXACT", provenance=_orphanet())
    without_provenance = Synonym(name="LTEC I", specificity="EXACT")

    cases = [
        (with_type, '"LTEC I" EXACT IUPAC_NAME [Orphanet:93938]'),
        (with_specificity, '"LTEC I" EXACT [Orphanet:93938]'),
        (without_specificity, '"LTEC I" [Orphanet:93938]'),
        (without_provenance, '"LTEC I" []'),
    ]
    for synonym, s in cases:
        with self.subTest(s=s):
            actual = _extract_synonym(s, typedefs, prefix="chebi", identifier="XXX")
            self.assertEqual(synonym, actual)
def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
    """Parse a pipe-delimited string of ``prefix:identifier(note)`` entries.

    :param s: the raw cell value; a missing value (per ``pd.isna``) gives an
        empty list
    :returns: a list of (reference, note) pairs in input order
    """
    if pd.isna(s):
        return []

    def _parse_one(entry):
        curie, raw_note = entry.split('(')
        prefix, identifier = curie.split(':', 1)
        return Reference(prefix=prefix, identifier=identifier), raw_note.rstrip(')')

    return [_parse_one(entry) for entry in s.split('|')]
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over DrugCentral terms.

    Rows missing a SMILES string, InChI or InChIKey are logged and skipped.
    The remaining rows yield a term with structure xrefs, plus a CAS xref when
    one is present.

    :param version: forwarded to ``ensure_df``
    """
    structures = ensure_df(PREFIX, url=URL, version=version)
    for smiles, inchi, inchi_key, drugcentral_id, drugcentral_name, cas in structures.values:
        if any(pd.isna(value) for value in (smiles, inchi, inchi_key)):
            logger.warning("missing data for drugcentral:%s", drugcentral_id)
            continue

        structure_xrefs = [
            Reference(prefix=xref_prefix, identifier=value)
            for xref_prefix, value in (
                ("smiles", smiles),
                ("inchi", inchi),
                ("inchikey", inchi_key),
            )
        ]
        if not pd.isna(cas):
            structure_xrefs.append(Reference(prefix="cas", identifier=cas))

        drug_reference = Reference(prefix=PREFIX, identifier=drugcentral_id, name=drugcentral_name)
        yield Term(reference=drug_reference, xrefs=structure_xrefs)
def test_extract_synonym(self):
    """Test extracting synonym strings.

    Covers a synonym with specificity and type, with specificity only, with
    neither, and with an empty provenance list.
    """
    iupac_name = SynonymTypeDef(id='IUPAC_NAME', name='IUPAC NAME')
    typedefs = {'IUPAC_NAME': iupac_name}

    def _orphanet():
        # A fresh list per synonym so the expected objects share no state.
        return [Reference('orphanet', '93938')]

    with_type = Synonym(
        name='LTEC I',
        specificity='EXACT',
        type=iupac_name,
        provenance=_orphanet(),
    )
    with_specificity = Synonym(name='LTEC I', specificity='EXACT', provenance=_orphanet())
    without_specificity = Synonym(name='LTEC I', specificity='EXACT', provenance=_orphanet())
    without_provenance = Synonym(name='LTEC I', specificity='EXACT')

    cases = [
        (with_type, '"LTEC I" EXACT IUPAC_NAME [Orphanet:93938]'),
        (with_specificity, '"LTEC I" EXACT [Orphanet:93938]'),
        (without_specificity, '"LTEC I" [Orphanet:93938]'),
        (without_provenance, '"LTEC I" []'),
    ]
    for synonym, s in cases:
        with self.subTest(s=s):
            actual = _extract_synonym(s, typedefs, prefix='chebi', identifier='XXX')
            self.assertEqual(synonym, actual)
def _get_term_from_tree(tree: ElementTree.ElementTree) -> Term:
    """Build a term from a parsed tree.

    The term's name, description and identifier come from the ``name``,
    ``description`` and ``identifier`` children. Every ``experiments`` child
    that has an identifier is attached with a ``has_part`` relationship.

    :param tree: the parsed element tree to read from
    :returns: the term built from the tree
    :raises ValueError: if the tree has no ``identifier`` child or it is empty
    """

    def _text(parent, tag):
        # ``find`` returns None for an absent child, so guard before ``.text``.
        element = parent.find(tag)
        return None if element is None else element.text

    name = _text(tree, 'name')
    description = _text(tree, 'description')
    if description:
        # Collapse the description onto a single line.
        description = description.strip().replace('\n', ' ')
    identifier = _text(tree, 'identifier')
    if identifier is None:
        raise ValueError('tree does not contain an identifier')
    term = Term(
        reference=Reference(PREFIX, identifier, name),
        definition=description,
    )
    for experiment in tree.findall('experiments'):
        experiment_name = _text(experiment, 'name')
        experiment_id = _text(experiment, 'identifier')
        if experiment_id is None:
            # An experiment without an identifier cannot be referenced.
            continue
        term.append_relationship(
            has_part,
            Reference(
                'gwascentral.experiment',
                identifier=experiment_id,
                name=experiment_name,
            ),
        )
    return term
def _get_term_from_tree(tree: ElementTree.ElementTree) -> Term:
    """Build a term from a parsed tree.

    The term's name, description and identifier come from the ``name``,
    ``description`` and ``identifier`` children. Every ``experiments`` child
    that has an identifier is attached with a ``has_part`` relationship.

    :param tree: the parsed element tree to read from
    :returns: the term built from the tree
    :raises ValueError: if the tree has no ``identifier`` child or it is empty
    """

    def _text(parent, tag):
        # ``find`` returns None for an absent child, so guard before ``.text``.
        element = parent.find(tag)
        return None if element is None else element.text

    name = _text(tree, "name")
    description = _text(tree, "description")
    if description:
        # Collapse the description onto a single line.
        description = description.strip().replace("\n", " ")
    identifier = _text(tree, "identifier")
    if identifier is None:
        raise ValueError("tree does not contain an identifier")
    term = Term(
        reference=Reference(PREFIX, identifier, name),
        definition=description,
    )
    for experiment in tree.findall("experiments"):
        experiment_name = _text(experiment, "name")
        experiment_id = _text(experiment, "identifier")
        if experiment_id is None:
            # An experiment without an identifier cannot be referenced.
            continue
        term.append_relationship(
            has_part,
            Reference(
                "gwascentral.experiment",
                identifier=experiment_id,
                name=experiment_name,
            ),
        )
    return term
def _parse_members(s) -> List[Tuple[Reference, str]]:
    """Parse a pipe-delimited string of ``entity(count)`` entries.

    An entity without a colon is given the ``uniprot`` prefix.

    :param s: the raw cell value; a missing value (per ``pd.isna``) gives an
        empty list
    :returns: a list of (reference, count) pairs in input order, with the
        count kept as a string
    """
    if pd.isna(s):
        return []

    def _parse_one(entry):
        entity, raw_count = entry.split('(')
        prefix, separator, identifier = entity.partition(':')
        if not separator:
            prefix, identifier = 'uniprot', entity
        return Reference(prefix=prefix, identifier=identifier), raw_count.rstrip(')')

    return [_parse_one(entry) for entry in s.split('|')]
def _parse_members(s) -> List[Tuple[Reference, str]]:
    """Parse a pipe-delimited string of ``entity(count)`` entries.

    An entity without a colon is given the ``uniprot`` prefix.

    :param s: the raw cell value; a missing value (per ``pd.isna``) gives an
        empty list
    :returns: a list of (reference, count) pairs in input order, with the
        count kept as a string
    """
    if pd.isna(s):
        return []

    def _parse_one(entry):
        entity, raw_count = entry.split("(")
        prefix, separator, identifier = entity.partition(":")
        if not separator:
            prefix, identifier = "uniprot", entity
        return Reference(prefix=prefix, identifier=identifier), raw_count.rstrip(")")

    return [_parse_one(entry) for entry in s.split("|")]
def get_terms(force: bool = False) -> Iterable[Term]:
    """Get CGNC terms.

    Rows whose CGNC identifier is missing or not int-like are logged and
    skipped. Every remaining row yields a term with the Gallus gallus species,
    optional ``ncbigene`` and ``ensembl`` xrefs, and up to two synonyms.

    :param force: forwarded to ``ensure_df``
    """
    table = ensure_df(PREFIX, url=URL, name=f"{PREFIX}.tsv", force=force, header=0, names=HEADER)
    for i, row in enumerate(table.values):
        cgnc_id, entrez_id, ensembl_id, name, first_synonym, second_synonym, _, _ = row
        if pd.isna(cgnc_id):
            logger.warning(f"row {i} CGNC ID is none")
            continue

        try:
            int(cgnc_id)
        except ValueError:
            logger.warning(f"row {i} CGNC ID is not int-like: {cgnc_id}")
            continue

        gene = Term.from_triple(
            prefix=PREFIX,
            identifier=cgnc_id,
            name=name,
        )
        gene.set_species(identifier="9031", name="Gallus gallus")
        if entrez_id and pd.notna(entrez_id):
            gene.append_xref(Reference(prefix="ncbigene", identifier=entrez_id))
        if pd.notna(ensembl_id):
            gene.append_xref(Reference(prefix="ensembl", identifier=ensembl_id))
        for synonym in (first_synonym, second_synonym):
            if synonym and pd.notna(synonym):
                gene.append_synonym(synonym)
        yield gene
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms.

    The sqlite database is extracted from the downloaded zip archive on first
    use. Names and hierarchy are then read fully into memory, the database
    connection is closed, and one term is yielded per identifier.

    :raises FileNotFoundError: if the sqlite file is still missing after
        extraction
    """
    zip_path = ensure_path(PREFIX, url=URL)
    version = _get_version()
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, name='ITIS.sqlite', version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for x in zip_file.filelist:
                if x.filename.endswith('.sqlite'):
                    zip_file.extract(x, sqlite_dir)
                    # Move the database out of the archive's subdirectory,
                    # then remove that directory.
                    shutil.move(
                        os.path.join(sqlite_dir, f'itisSqlite{version}', 'ITIS.sqlite'),
                        sqlite_path)
                    os.rmdir(os.path.join(sqlite_dir, f'itisSqlite{version}'))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    # Close the connection once both queries are materialized; previously it
    # was never closed.
    with closing(sqlite3.connect(sqlite_path)) as conn:
        with closing(conn.cursor()) as cursor:
            cursor.execute(LONGNAMES_QUERY)
            id_to_reference = {
                str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
                for identifier, name in cursor.fetchall()
            }

        with closing(conn.cursor()) as cursor:
            cursor.execute(HIERARCHY_QUERY)
            id_to_parents = multidict(
                (str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == '0':  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over ChEMBL compound's names.

    Opens the sqlite database at the path returned by ``get_path``, runs
    ``QUERY`` and yields one term per (identifier, name) row.

    :param version: forwarded to ``get_path``
    """
    database_path = get_path(version=version)
    logger.info('opening connection to %s', database_path)
    with closing(sqlite3.connect(database_path)) as conn:
        logger.info('using connection %s', conn)
        with closing(conn.cursor()) as cursor:
            logger.info('using cursor %s', cursor)
            cursor.execute(QUERY)
            rows = cursor.fetchall()
            for chembl_id, name in rows:
                compound = Reference(prefix=PREFIX, identifier=chembl_id, name=name)
                # TODO add xrefs to smiles, inchi, inchikey here
                compound_xrefs = []
                yield Term(reference=compound, xrefs=compound_xrefs)
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over ChEMBL compounds.

    Runs ``QUERY`` on a connection from ``chembl_downloader.connect`` and
    yields one term per row. Non-empty SMILES and InChI values become
    properties, and a non-empty InChIKey becomes an xref.

    :param version: forwarded to ``chembl_downloader.connect``
    """
    with chembl_downloader.connect(version=version) as conn:
        logger.info("using connection %s", conn)
        with closing(conn.cursor()) as cursor:
            logger.info("using cursor %s", cursor)
            cursor.execute(QUERY)
            rows = cursor.fetchall()
            for chembl_id, name, smiles, inchi, inchi_key in rows:
                # TODO add xrefs?
                compound = Term.from_triple(prefix=PREFIX, identifier=chembl_id, name=name)
                for property_name, value in (("smiles", smiles), ("inchi", inchi)):
                    if value:
                        compound.append_property(property_name, value)
                if inchi_key:
                    compound.append_xref(Reference("inchikey", inchi_key))
                yield compound
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over terms from GWAS Central Phenotype.

    Phenotype numbers 1 through 10999 are tried in order. A number whose
    download raises ``OSError`` is reported through ``tqdm.write`` and skipped.

    :param version: forwarded to ``ensure_path``
    """
    for n in trange(1, 11000, desc=f'{PREFIX} download'):
        try:
            path = ensure_path(
                PREFIX, 'phenotype',
                version=version,
                url=f'https://www.gwascentral.org/phenotype/HGVPM{n}?format=json',
                name=f'HGVPM{n}.json',
            )
        except OSError as e:
            tqdm.write(f'{n}: {e}')
            continue

        with open(path) as file:
            j = json.load(file)

        # A record may have no description, or a null one; only clean it up
        # when there is text to clean.
        description = j.get('description')
        if description is not None:
            description = description.strip().replace('\n', ' ')
        term = Term(
            reference=Reference(PREFIX, j['identifier'], j['name']),
            definition=description,
        )
        yield term
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get ITIS terms.

    The sqlite database is extracted from the downloaded zip archive on first
    use. Names and hierarchy are then read fully into memory, the database
    connection is closed, and one term is yielded per identifier.

    :param version: forwarded to ``ensure_path`` and ``prefix_directory_join``
    :param force: forwarded to ``ensure_path``
    :raises FileNotFoundError: if the sqlite file is still missing after
        extraction
    """
    zip_path = ensure_path(PREFIX, url=URL, force=force, version=version)
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, name="itis.sqlite", version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for file in zip_file.filelist:
                if file.filename.endswith(".sqlite") and not file.is_dir():
                    zip_file.extract(file, sqlite_dir)
                    # Move the database out of the archive's subdirectory,
                    # then remove that directory.
                    shutil.move(os.path.join(sqlite_dir, file.filename), sqlite_path)
                    os.rmdir(os.path.join(sqlite_dir, os.path.dirname(file.filename)))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f"file missing: {sqlite_path}")

    # Close the connection once both queries are materialized; previously it
    # was never closed.
    with closing(sqlite3.connect(sqlite_path.as_posix())) as conn:
        with closing(conn.cursor()) as cursor:
            cursor.execute(LONGNAMES_QUERY)
            id_to_reference = {
                str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
                for identifier, name in cursor.fetchall()
            }

        with closing(conn.cursor()) as cursor:
            cursor.execute(HIERARCHY_QUERY)
            id_to_parents = multidict(
                (str(child), str(parent)) for child, parent in cursor.fetchall()
            )

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == "0":  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms.

    The downloaded zip archive is extracted into the prefix directory when the
    sqlite file is not there yet. Names and hierarchy are then read fully into
    memory, the database connection is closed, and one term is yielded per
    identifier.

    :raises FileNotFoundError: if the sqlite file is still missing after
        extraction
    """
    zip_path = ensure_path(PREFIX, URL)
    sqlite_path = prefix_directory_join(PREFIX, 'itisSqlite043020', 'ITIS.sqlite')
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(get_prefix_directory(PREFIX))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    # Close the connection once both queries are materialized; previously it
    # was never closed.
    with closing(sqlite3.connect(sqlite_path)) as conn:
        with closing(conn.cursor()) as cursor:
            cursor.execute(LONGNAMES_QUERY)
            id_to_reference = {
                str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
                for identifier, name in cursor.fetchall()
            }

        with closing(conn.cursor()) as cursor:
            cursor.execute(HIERARCHY_QUERY)
            id_to_parents = multidict(
                (str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == '0':  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
    """Parse a pipe-delimited string of ``curie(note)`` entries.

    Entries that do not split into exactly two parts on ``(``, or whose CURIE
    cannot be parsed, are logged at warning level and skipped.

    :param s: the raw cell value; a missing value (per ``pd.isna``) gives an
        empty list
    :returns: a list of (reference, note) pairs in input order
    """
    if pd.isna(s):
        return []

    parsed = []
    for raw in s.split("|"):
        # Rewrite both "protein ontology" spellings to the "PR:" prefix.
        xref = raw.replace("protein ontology:PR:", "PR:").replace("protein ontology:PR_", "PR:")
        pieces = xref.split("(")
        if len(pieces) != 2:
            logger.warning("xref missing (: %s", xref)
            continue
        curie, note = pieces[0], pieces[1].rstrip(")")

        try:
            reference = Reference.from_curie(curie)
        except ValueError:
            logger.warning("can not parse CURIE: %s", xref)
            continue

        if reference is not None:
            parsed.append((reference, note))
        else:
            logger.warning("reference is None after parsing: %s", xref)
    return parsed
def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
    """Parse a pipe-delimited string of ``curie(note)`` entries.

    Entries that do not split into exactly two parts on ``(``, or whose CURIE
    cannot be parsed, are logged at warning level and skipped.

    :param s: the raw cell value; a missing value (per ``pd.isna``) gives an
        empty list
    :returns: a list of (reference, note) pairs in input order
    """
    if pd.isna(s):
        return []

    parsed = []
    for raw in s.split('|'):
        # Rewrite both "protein ontology" spellings to the "PR:" prefix.
        xref = raw.replace('protein ontology:PR:', 'PR:').replace('protein ontology:PR_', 'PR:')
        pieces = xref.split('(')
        if len(pieces) != 2:
            logger.warning('xref missing (: %s', xref)
            continue
        curie, note = pieces[0], pieces[1].rstrip(')')

        try:
            reference = Reference.from_curie(curie)
        except ValueError:
            logger.warning('can not parse CURIE: %s', xref)
            continue

        if reference is not None:
            parsed.append((reference, note))
        else:
            logger.warning('reference is None after parsing: %s', xref)
    return parsed