def bens_magical_ontology(use_tqdm: bool = True) -> nx.DiGraph:
    """Make a super graph containing is_a, part_of, and xref relationships."""
    graph = nx.DiGraph()

    # First, load every curated xref as an edge annotated with its provenance.
    xref_df = ensure_inspector_javert_df()
    for source_ns, source_id, target_ns, target_id, provenance in xref_df.values:
        graph.add_edge(
            f'{source_ns}:{source_id}',
            f'{target_ns}:{target_id}',
            relation='xref',
            provenance=provenance,
        )

    # Second, layer each resource's hierarchy (is_a/part_of/has_member) on top.
    logger.info('getting hierarchies')
    prefixes = sorted(bioregistry.read_bioregistry())
    if use_tqdm:
        prefixes = tqdm(prefixes, desc='Entries')
    for prefix in prefixes:
        if bioregistry.is_deprecated(prefix) or prefix in SKIP:
            continue
        if use_tqdm:
            prefixes.set_postfix({'prefix': prefix})
        hierarchy = get_hierarchy(prefix, include_has_member=True, include_part_of=True)
        graph.add_edges_from(hierarchy.edges(data=True))

    # TODO include translates_to, transcribes_to, and has_variant
    return graph
def get_prefix_to_miriam_prefix() -> Mapping[str, Tuple[str, str]]:
    """Get a mapping of bioregistry prefixes to MIRIAM prefixes."""
    rv = {}
    for prefix, entry in bioregistry.read_bioregistry().items():
        miriam = entry.get('miriam', {})
        if 'prefix' in miriam:
            rv[prefix] = (miriam['prefix'], miriam['namespaceEmbeddedInLui'])
    return rv
def get_curated_urls() -> Mapping[str, str]:
    """Get a mapping of prefixes to their custom download URLs."""
    #: URLs of resources that weren't listed in OBO Foundry properly
    rv = {}
    for prefix, entry in bioregistry.read_bioregistry().items():
        if 'download' in entry:
            rv[prefix] = entry['download']
    return rv
def get_not_available_as_obo() -> set:
    """Get the set of prefixes manually annotated as not available in OBO.

    :returns: The prefixes whose bioregistry entry carries a truthy
        ``not_available_as_obo`` flag.
    """
    #: A set of prefixes that have been manually annotated as not being available in OBO
    return {
        bioregistry_prefix
        for bioregistry_prefix, bioregistry_entry in bioregistry.read_bioregistry().items()
        # .get() covers both "key missing" and "key present but falsy" in one lookup,
        # replacing the original `'k' in d and d['k']` double test
        if bioregistry_entry.get('not_available_as_obo')
    }
def iter_helper_helper(
    f: Callable[[str], X],
    use_tqdm: bool = True,
    skip_below: Optional[str] = None,
    skip_pyobo: bool = False,
    strict: bool = True,
) -> Iterable[Tuple[str, X]]:
    """Yield all mappings extracted from each database given.

    :param f: A function that takes a prefix and gives back something
        that will be used by an outer function.
    :param use_tqdm: If true, wrap the prefix iterator in a tqdm progress bar.
    :param skip_below: If given, skip prefixes that sort lexicographically
        before this one (useful for resuming a partially-completed run).
    :param skip_pyobo: If true, skip prefixes that have a PyOBO nomenclature plugin.
    :param strict: If true, will raise exceptions and crash the program
        instead of logging them.
    :raises HTTPError: If the resource could not be downloaded
    :raises URLError: If another problem was encountered during download
    :raises ValueError: If the data was not in the format that was expected (e.g., OWL)
    """
    it = sorted(bioregistry.read_bioregistry())
    if use_tqdm:
        it = tqdm(it)
    for prefix in it:
        # Skip order: hard-coded skip set, then resume point, then plugin check.
        if prefix in SKIP:
            continue
        if skip_below is not None and prefix < skip_below:
            continue
        if skip_pyobo and has_nomenclature_plugin(prefix):
            continue
        if use_tqdm:
            it.set_postfix({'prefix': prefix})
        try:
            mapping = f(prefix)
        except NoBuild:
            # Nothing to build for this prefix — silently move on.
            continue
        except urllib.error.HTTPError as e:
            logger.warning('[%s] HTTP %s: unable to download %s', prefix, e.getcode(), e.geturl())
            if strict:
                raise
        except urllib.error.URLError:
            logger.warning('[%s] unable to download', prefix)
            if strict:
                raise
        except ValueError as e:
            if _is_xml(e):
                # this means that it tried doing parsing on an xml page saying get the f**k out
                logger.info(
                    'no resource available for %s. See http://www.obofoundry.org/ontology/%s',
                    prefix, prefix)
            else:
                logger.warning('[%s] error while parsing: %s', prefix, e)
            if strict:
                raise e
        else:
            # Only yield when f() succeeded; handled exceptions fall through
            # to the next prefix without yielding.
            yield prefix, mapping
def iterate_wikidata_dfs(*, use_tqdm: bool = True) -> Iterable[pd.DataFrame]:
    """Iterate over WikiData xref dataframes."""
    # Collect the Wikidata property for every bioregistry entry that has one.
    wikidata_properties = {}
    for prefix, entry in bioregistry.read_bioregistry().items():
        wikidata = entry.get('wikidata', {})
        if 'property' in wikidata:
            wikidata_properties[prefix] = wikidata['property']
    # wikidata_properties.update(get_wikidata_properties())

    pairs = sorted(wikidata_properties.items())
    if use_tqdm:
        pairs = tqdm(pairs, desc='Wikidata properties')
    for prefix, wikidata_property in pairs:
        if prefix in {'pubmed', 'pmc', 'orcid'}:
            continue  # too many
        try:
            yield get_wikidata_df(prefix, wikidata_property)
        except json.decoder.JSONDecodeError as e:
            logger.warning(
                '[%s] Problem decoding results from %s: %s',
                prefix,
                wikidata_property,
                e,
            )
def iter_helper_helper(
    f: Callable[[str], X],
    use_tqdm: bool = True,
    skip_below: Optional[str] = None,
    skip_pyobo: bool = False,
    skip_set: Optional[Set[str]] = None,
    strict: bool = True,
    **kwargs,
) -> Iterable[Tuple[str, X]]:
    """Yield all mappings extracted from each database given.

    :param f: A function that takes a prefix and gives back something
        that will be used by an outer function.
    :param use_tqdm: If true, use the tqdm progress bar
    :param skip_below: If true, skip sources whose names are less than this (used for iterative curation
    :param skip_pyobo: If true, skip sources implemented in PyOBO
    :param skip_set: A pre-defined blacklist to skip
    :param strict: If true, will raise exceptions and crash the program instead of logging them.
    :param kwargs: Keyword arguments passed to ``f``.
    :yields: A prefix and the result of the callable ``f``
    :raises TypeError: If a type error is raised, it gets re-raised
    :raises urllib.error.HTTPError: If the resource could not be downloaded
    :raises urllib.error.URLError: If another problem was encountered during download
    :raises ValueError: If the data was not in the format that was expected (e.g., OWL)
    """
    it = sorted(bioregistry.read_bioregistry())
    if use_tqdm:
        # disable=None lets tqdm auto-disable itself on non-TTY streams
        it = tqdm(it, disable=None, desc='Resources')
    for prefix in it:
        if use_tqdm:
            it.set_postfix({'prefix': prefix})
        if prefix in SKIP:
            tqdm.write(f'skipping {prefix} because in default skip set')
            continue
        if skip_set and prefix in skip_set:
            tqdm.write(f'skipping {prefix} because in skip set')
            continue
        if skip_below is not None and prefix < skip_below:
            continue
        if skip_pyobo and has_nomenclature_plugin(prefix):
            continue
        try:
            yv = f(prefix, **kwargs)
        except NoBuild:
            # Nothing to build for this prefix — skip without logging.
            continue
        except urllib.error.HTTPError as e:
            logger.warning('[%s] HTTP %s: unable to download %s', prefix, e.getcode(), e.geturl())
            # Download failures for deprecated resources are tolerated even in strict mode.
            if strict and not bioregistry.is_deprecated(prefix):
                raise
        except urllib.error.URLError:
            logger.warning('[%s] unable to download', prefix)
            if strict and not bioregistry.is_deprecated(prefix):
                raise
        except MissingPrefix as e:
            logger.warning('[%s] missing prefix: %s', prefix, e)
            if strict:
                raise e
        except ValueError as e:
            if _is_xml(e):
                # this means that it tried doing parsing on an xml page saying get the f**k out
                logger.info(
                    'no resource available for %s. See http://www.obofoundry.org/ontology/%s',
                    prefix, prefix)
            else:
                logger.exception('[%s] error while parsing: %s', prefix, e.__class__)
            if strict:
                raise e
        except TypeError as e:
            logger.exception('TypeError on %s', prefix)
            if strict:
                raise e
        else:
            # Only yield when f() succeeded; handled exceptions fall through
            # to the next prefix without yielding.
            yield prefix, yv
def load(
    load_all: bool,
    load_resources: bool = False,
    load_names: bool = False,
    load_alts: bool = False,
    load_xrefs: bool = True,
    load_synonyms: bool = False,
    reset: bool = False,
) -> None:
    """Load the database.

    :param load_all: If true, load every table regardless of the individual flags.
    :param load_resources: If true, insert a ``Resource`` row for each
        non-deprecated bioregistry prefix not already present.
    :param load_names: If true, bulk-load the names (``Reference``) table.
    :param load_alts: If true, bulk-load the alternate-identifier table.
    :param load_xrefs: If true, bulk-load the xrefs table.
    :param load_synonyms: If true, bulk-load the synonyms table.
    :param reset: If true, drop and recreate the schema before loading.
    """
    if reset:
        drop_all()
        create_all()

    if load_resources or load_all:
        prefix_to_resource: Dict[str, Resource] = {}
        # Pre-existing prefixes are skipped so the load is idempotent.
        prefixes = {resource.prefix for resource in Resource.query.all()}
        for prefix, entry in tqdm(bioregistry.read_bioregistry().items(), desc='loading resources'):
            if bioregistry.is_deprecated(prefix):
                continue
            if prefix in prefixes:
                continue
            prefix_to_resource[prefix] = resource_model = Resource(
                prefix=prefix,
                name=entry['name'],
                pattern=bioregistry.get_pattern(prefix),
            )
            session.add(resource_model)
        session.commit()

    # Pre-cached gzipped TSV dumps used for the bulk COPY loads below.
    ooh_na_na_path = ensure_ooh_na_na()
    synonyms_path = ensure_synonyms()
    xrefs_path = ensure_inspector_javert()

    if load_alts or load_all:
        alts_path = ensure_alts()
        alts_df = pd.read_csv(alts_path, sep='\t', dtype=str)  # prefix, alt, identifier
        logger.info('inserting %d alt identifiers', len(alts_df.index))
        alts_df.to_sql(name=Alt.__tablename__, con=engine, if_exists='append', index=False)
        logger.info('committing alt identifier')
        session.commit()
        logger.info('done committing alt identifiers')

    # Each tuple: (label for logging, gzipped TSV path, model class,
    # explicit column list or None for all columns, flag enabling this load).
    for label, path, table, columns, checker in [
        ('names', ooh_na_na_path, Reference, None, load_names),
        ('synonyms', synonyms_path, Synonym, ['prefix', 'identifier', 'name'], load_synonyms),
        ('xrefs', xrefs_path, Xref, ['prefix', 'identifier', 'xref_prefix', 'xref_identifier', 'source'], load_xrefs),
    ]:
        if not checker and not load_all:
            continue
        logger.info('beginning insertion of %s', label)
        # Raw DBAPI connection: COPY ... FROM STDIN bypasses the ORM entirely.
        conn = engine.raw_connection()
        logger.info('inserting with low-level copy of %s from: %s', label, path)
        if columns:
            columns = ', '.join(columns)
            logger.info('corresponding to columns: %s', columns)
            columns = f' ({columns})'
        else:
            columns = ''
        with conn.cursor() as cursor, gzip.open(path) as file:
            # next(file)  # skip the header
            # NOTE(review): the original line break inside this f-string was lost
            # in extraction; PostgreSQL's COPY is whitespace-tolerant, so the
            # reconstruction below is semantically equivalent.
            sql = f'''COPY {table.__tablename__}{columns}
            FROM STDIN WITH CSV HEADER DELIMITER E'\\t' QUOTE E'\\b';'''
            logger.info('running SQL: %s', sql)
            cursor.copy_expert(sql=sql, file=file)

        logger.info('committing %s', label)
        conn.commit()
        logger.info('done committing %s', label)

    logger.info(f'number resources loaded: {Resource.query.count():,}')
    logger.info(f'number references loaded: {Reference.query.count():,}')
    logger.info(f'number alts loaded: {Alt.query.count():,}')
    logger.info(f'number synonyms loaded: {Synonym.query.count():,}')
    logger.info(f'number xrefs loaded: {Xref.query.count():,}')
def _get_map(registry: str) -> Mapping[str, str]:
    """Map each bioregistry prefix to its prefix in the given external registry."""
    rv = {}
    for prefix, entry in bioregistry.read_bioregistry().items():
        if registry in entry:
            rv[prefix] = entry[registry]['prefix']
    return rv
def key(self) -> str:
    """Get the OBO Foundry key."""
    entry = bioregistry.read_bioregistry()[self.bioregistry_id]
    return entry['obofoundry']['prefix']
import logging
from typing import Iterable, Mapping, Optional, Type, Union

import bioregistry
from bioregistry.external.ols import get_ols
from bioregistry.resolve import _clean_version, get_name
from bioversions.utils import Getter, VersionType

logger = logging.getLogger(__name__)

# Maps each bioregistry identifier to its OLS prefix, for entries that have
# an 'ols' record. Computed once at import time.
bioregistry_id_to_ols_id = {
    bioregistry_id: bioregistry_entry['ols']['prefix']
    for bioregistry_id, bioregistry_entry in bioregistry.read_bioregistry().items()
    if 'ols' in bioregistry_entry
}


def _get_version_type(bioregistry_id: str) -> Optional[VersionType]:
    """Look up the OLS version type for the given bioregistry identifier.

    A date format annotation takes precedence over an explicit version type.
    Returns None (implicitly) when neither annotation is present.
    """
    # NOTE(review): assumes bioregistry.get() returns a dict-like entry for
    # this identifier — .get() would fail if it could return None; confirm.
    ols_entry = bioregistry.get(bioregistry_id)
    ols_version_type = ols_entry.get('ols_version_type')
    ols_version_date_format = ols_entry.get('ols_version_date_format')
    if ols_version_date_format:
        return VersionType.date
    elif ols_version_type:
        # The stored string is expected to name a VersionType member;
        # getattr raises AttributeError for unknown names.
        return getattr(VersionType, ols_version_type)
    else:
        # Falls through to an implicit None after warning.
        logger.warning('[%s] missing version type', bioregistry_id)