def test_curies(self):
    """Test that every mapping row uses registered, non-deprecated prefixes and valid identifiers.

    For each dataframe produced by the getters in ``DATA``, checks both the
    source and target columns of every row against the bioregistry and,
    where available, the MIRIAM identifier pattern.
    """
    registry = dict(bioregistry.read_registry())
    # TODO decopath needs a real resource and an entry in the bioregistry
    registry['decopath'] = {}
    # Pre-compile MIRIAM patterns once; not every entry has one.
    miriam_patterns = {
        k: re.compile(entry['miriam']['pattern'])
        for k, entry in registry.items()
        if 'miriam' in entry
    }
    # e.g. ``get_famplex_df`` -> key "famplex" (requires Python 3.9+).
    dataframes = {
        getter.__name__.removeprefix('get_').removesuffix('_df'): getter()
        for _, getter in DATA
    }
    columns = ['Source Resource', 'Source ID', 'Target Resource', 'Target ID']

    def _check_curie(name, row_index, role, prefix, identifier):
        # Shared validation for both ends of a mapping row; ``role`` is
        # either "source" or "target" and only affects failure messages.
        self.assertIn(prefix, registry)
        self.assertNotEqual(prefix, 'kegg')
        self.assertFalse(
            bioregistry.is_deprecated(prefix),
            msg=f'[{name}, row {row_index}] deprecated {role} prefix: {prefix}',
        )
        if regex := miriam_patterns.get(prefix):
            self.assertRegex(
                identifier,
                regex,
                msg=f'[{name}, row {row_index}] {role} prefix: {prefix}',
            )

    for name, df in dataframes.items():
        with self.subTest(name=name):
            for i, (source_prefix, source_id, target_prefix, target_id) in enumerate(df[columns].values):
                _check_curie(name, i, 'source', source_prefix, source_id)
                _check_curie(name, i, 'target', target_prefix, target_id)
def bens_magical_ontology(use_tqdm: bool = True) -> nx.DiGraph:
    """Make a super graph containing is_a, part_of, and xref relationships.

    :param use_tqdm: If true, show a progress bar over registry entries.
    :return: A directed graph of CURIE-labeled nodes.
    """
    graph = nx.DiGraph()

    # First layer: xref edges from the Inspector Javert dataframe.
    xref_df = ensure_inspector_javert_df()
    for source_ns, source_id, target_ns, target_id, provenance in xref_df.values:
        graph.add_edge(
            f"{source_ns}:{source_id}",
            f"{target_ns}:{target_id}",
            relation="xref",
            provenance=provenance,
        )

    # Second layer: hierarchical edges from each registered resource.
    logger.info("getting hierarchies")
    prefixes = sorted(bioregistry.read_registry())
    progress = tqdm(prefixes, desc="Entries") if use_tqdm else prefixes
    for prefix in progress:
        if bioregistry.is_deprecated(prefix) or prefix in SKIP:
            continue
        if use_tqdm:
            progress.set_postfix({"prefix": prefix})
        hierarchy = get_hierarchy(prefix, include_has_member=True, include_part_of=True)
        graph.add_edges_from(hierarchy.edges(data=True))

    # TODO include translates_to, transcribes_to, and has_variant
    return graph
def resource(prefix: str):
    """Serve a Bioregistry entry page."""
    # NOTE(review): when the prefix is unknown, _normalize_prefix_or_404
    # presumably returns a response object (404/redirect) rather than a
    # string, which is then passed straight through — confirm in the helper.
    prefix = _normalize_prefix_or_404(prefix, '.' + resource.__name__)
    if not isinstance(prefix, str):
        return prefix
    example = bioregistry.get_example(prefix)
    return render_template(
        'resource.html',
        prefix=prefix,
        name=bioregistry.get_name(prefix),
        example=example,
        mappings=_get_resource_mapping_rows(prefix),
        synonyms=bioregistry.get_synonyms(prefix),
        homepage=bioregistry.get_homepage(prefix),
        pattern=bioregistry.get_pattern(prefix),
        version=bioregistry.get_version(prefix),
        has_terms=bioregistry.has_terms(prefix),
        obo_download=bioregistry.get_obo_download(prefix),
        owl_download=bioregistry.get_owl_download(prefix),
        namespace_in_lui=bioregistry.namespace_in_lui(prefix),
        deprecated=bioregistry.is_deprecated(prefix),
        contact=bioregistry.get_email(prefix),
        banana=bioregistry.get_banana(prefix),
        description=bioregistry.get_description(prefix),
        # Providers can only be rendered when an example identifier exists.
        providers=None if example is None else _get_resource_providers(prefix, example),
    )
def iter_cached_obo() -> Iterable[Tuple[str, str]]:
    """Iterate over cached OBO paths.

    :yields: Pairs of (prefix, path to a cached ``.obo`` file).
    """
    for prefix in os.listdir(RAW_DIRECTORY):
        # Skip blacklisted, download-less, and deprecated resources.
        if prefix in GLOBAL_SKIP or has_no_download(prefix) or bioregistry.is_deprecated(prefix):
            continue
        directory = os.path.join(RAW_DIRECTORY, prefix)
        if not os.path.isdir(directory):
            continue
        for filename in os.listdir(directory):
            if not filename.endswith(".obo"):
                continue
            yield prefix, os.path.join(directory, filename)
def test_name_expansions(self):
    """Test that default names are not capital acronyms."""
    for prefix in bioregistry.read_registry():
        if bioregistry.is_deprecated(prefix):
            continue
        # Entries with an explicitly curated name are exempt.
        if 'name' in bioregistry.get(prefix):
            continue
        name = bioregistry.get_name(prefix)
        # A name that is just the prefix shouted back is an unexpanded acronym.
        looks_like_acronym = prefix == name.lower() and name.upper() == name
        if looks_like_acronym:
            with self.subTest(prefix=prefix):
                self.fail(msg=f'{prefix} acronym ({name}) is not expanded')
        # Dotted prefixes like "foo.bar" whose stem matches the name are also suspect.
        if '.' in prefix and prefix.split('.')[0] == name.lower():
            with self.subTest(prefix=prefix):
                self.fail(msg=f'{prefix} acronym ({name}) is not expanded')
def test_no_redundant_acronym(self):
    """Test that there is no redundant acronym in the name.

    For example, "Amazon Standard Identification Number (ASIN)" is a
    problematic name for prefix "asin".
    """
    for prefix in bioregistry.read_registry():
        if bioregistry.is_deprecated(prefix):
            continue
        # Entries with an explicitly curated name are exempt.
        entry = bioregistry.get(prefix)
        if 'name' in entry:
            continue
        name = bioregistry.get_name(prefix)
        try:
            # Pull out a trailing parenthesized chunk, e.g. "... (ASIN)" -> "ASIN".
            _, rest = name.rstrip(')').rsplit('(', 1)
        except ValueError:
            continue  # no parenthesized suffix to check
        if rest.lower() == prefix.lower():
            with self.subTest(prefix=prefix):
                # Fixed typo in the failure message: "redundany" -> "redundant".
                self.fail(msg=f'{prefix} has redundant acronym in name "{name}"')
def load(
    load_all: bool,
    load_resources: bool = False,
    load_names: bool = False,
    load_alts: bool = False,
    load_xrefs: bool = True,
    load_synonyms: bool = False,
    reset: bool = False,
) -> None:
    """Load the database.

    :param load_all: If true, load every table regardless of the other flags.
    :param load_resources: Load the resource table from the bioregistry.
    :param load_names: Load the reference (names) table.
    :param load_alts: Load the alternate-identifier table.
    :param load_xrefs: Load the xrefs table.
    :param load_synonyms: Load the synonyms table.
    :param reset: If true, drop all tables before loading.
    """
    if reset:
        drop_all()
    create_all()
    if load_resources or load_all:
        prefix_to_resource: Dict[str, Resource] = {}
        # Pre-load existing prefixes so reloads don't insert duplicates.
        prefixes = {resource.prefix for resource in Resource.query.all()}
        for prefix, entry in tqdm(bioregistry.read_registry().items(), desc="loading resources"):
            if bioregistry.is_deprecated(prefix):
                continue
            if prefix in prefixes:
                continue
            prefix_to_resource[prefix] = resource_model = Resource(
                prefix=prefix,
                name=entry["name"],
                pattern=bioregistry.get_pattern(prefix),
            )
            session.add(resource_model)
        session.commit()
    ooh_na_na_path = ensure_ooh_na_na()
    synonyms_path = ensure_synonyms()
    xrefs_path = ensure_inspector_javert()
    if load_alts or load_all:
        alts_path = ensure_alts()
        alts_df = pd.read_csv(alts_path, sep="\t", dtype=str)  # prefix, alt, identifier
        logger.info("inserting %d alt identifiers", len(alts_df.index))
        # Small enough table to go through pandas' to_sql append path.
        alts_df.to_sql(name=Alt.__tablename__, con=engine, if_exists="append", index=False)
        logger.info("committing alt identifier")
        session.commit()
        logger.info("done committing alt identifiers")
    # The big tables are bulk-loaded with PostgreSQL COPY via a raw
    # connection, which is far faster than ORM inserts.
    # Each entry: (label, gzipped TSV path, model, explicit columns or None, flag).
    for label, path, table, columns, checker in [
        ("names", ooh_na_na_path, Reference, None, load_names),
        ("synonyms", synonyms_path, Synonym, ["prefix", "identifier", "name"], load_synonyms),
        (
            "xrefs",
            xrefs_path,
            Xref,
            ["prefix", "identifier", "xref_prefix", "xref_identifier", "source"],
            load_xrefs,
        ),
    ]:
        if not checker and not load_all:
            continue
        logger.info("beginning insertion of %s", label)
        conn = engine.raw_connection()
        logger.info("inserting with low-level copy of %s from: %s", label, path)
        if columns:
            # Rebind ``columns`` from a list to the parenthesized SQL fragment.
            columns = ", ".join(columns)
            logger.info("corresponding to columns: %s", columns)
            columns = f" ({columns})"
        else:
            columns = ""
        with conn.cursor() as cursor, gzip.open(path) as file:
            # next(file)  # skip the header
            # HEADER makes COPY skip the first line itself; E'\t'/E'\b' set
            # tab as delimiter and an unused quote character for raw TSV.
            sql = f"""COPY 
{table.__tablename__}{columns} FROM STDIN WITH CSV HEADER DELIMITER E'\\t' QUOTE E'\\b';"""
            logger.info("running SQL: %s", sql)
            cursor.copy_expert(sql=sql, file=file)
        logger.info("committing %s", label)
        conn.commit()
        logger.info("done committing %s", label)
    logger.info(f"number resources loaded: {Resource.query.count():,}")
    logger.info(f"number references loaded: {Reference.query.count():,}")
    logger.info(f"number alts loaded: {Alt.query.count():,}")
    logger.info(f"number synonyms loaded: {Synonym.query.count():,}")
    logger.info(f"number xrefs loaded: {Xref.query.count():,}")
def iter_helper_helper(
    f: Callable[[str], X],
    use_tqdm: bool = True,
    skip_below: Optional[str] = None,
    skip_below_inclusive: bool = True,
    skip_pyobo: bool = False,
    skip_set: Optional[Set[str]] = None,
    strict: bool = True,
    **kwargs,
) -> Iterable[Tuple[str, X]]:
    """Yield all mappings extracted from each database given.

    :param f: A function that takes a prefix and gives back something that will be used by an outer
        function.
    :param use_tqdm: If true, use the tqdm progress bar
    :param skip_below: If true, skip sources whose names are less than this (used for iterative
        curation)
    :param skip_below_inclusive: If true, the ``skip_below`` cutoff is inclusive
    :param skip_pyobo: If true, skip sources implemented in PyOBO
    :param skip_set: A pre-defined blacklist to skip
    :param strict: If true, will raise exceptions and crash the program instead of logging them.
    :param kwargs: Keyword arguments passed to ``f``.
    :yields: A prefix and the result of the callable ``f``
    :raises TypeError: If a type error is raised, it gets re-raised
    :raises urllib.error.HTTPError: If the resource could not be downloaded
    :raises urllib.error.URLError: If another problem was encountered during download
    :raises ValueError: If the data was not in the format that was expected (e.g., OWL)
    """
    # Prefix filtering (skip sets, cutoffs, PyOBO plugins) is delegated to _prefixes.
    prefixes = list(
        _prefixes(
            skip_set=skip_set,
            skip_below=skip_below,
            skip_pyobo=skip_pyobo,
            skip_below_inclusive=skip_below_inclusive,
        ))
    prefix_it = tqdm(prefixes, disable=not use_tqdm, desc=f"Building with {f.__name__}()", unit="resource")
    for prefix in prefix_it:
        prefix_it.set_postfix(prefix=prefix)
        try:
            yv = f(prefix, **kwargs)  # type:ignore
        except urllib.error.HTTPError as e:
            logger.warning("[%s] HTTP %s: unable to download %s", prefix, e.getcode(), e.geturl())
            # Download failures on deprecated resources are tolerated even in strict mode.
            if strict and not bioregistry.is_deprecated(prefix):
                raise
        except urllib.error.URLError:
            logger.warning("[%s] unable to download", prefix)
            if strict and not bioregistry.is_deprecated(prefix):
                raise
        except MissingPrefix as e:
            logger.warning("[%s] missing prefix: %s", prefix, e)
            if strict and not bioregistry.is_deprecated(prefix):
                raise e
        except subprocess.CalledProcessError:
            # ROBOT conversion failures are never fatal, even in strict mode.
            logger.warning("[%s] ROBOT was unable to convert OWL to OBO", prefix)
        except UnhandledFormat as e:
            logger.warning("[%s] %s", prefix, e)
        except ValueError as e:
            if _is_xml(e):
                # this means that it tried doing parsing on an xml page
                logger.info(
                    "no resource available for %s. See http://www.obofoundry.org/ontology/%s",
                    prefix,
                    prefix,
                )
            else:
                logger.exception("[%s] got exception %s while parsing", prefix, e.__class__.__name__)
        except TypeError as e:
            # TypeErrors indicate a programming bug, so strict mode re-raises them
            # regardless of deprecation status.
            logger.exception("[%s] got exception %s while parsing", prefix, e.__class__.__name__)
            if strict:
                raise e
        else:
            # Only yield when ``f`` succeeded; failed prefixes are skipped.
            yield prefix, yv
def iter_helper_helper(
    f: Callable[[str], X],
    use_tqdm: bool = True,
    skip_below: Optional[str] = None,
    skip_pyobo: bool = False,
    skip_set: Optional[Set[str]] = None,
    strict: bool = True,
    **kwargs,
) -> Iterable[Tuple[str, X]]:
    """Yield all mappings extracted from each database given.

    :param f: A function that takes a prefix and gives back something that will be used by an outer
        function.
    :param use_tqdm: If true, use the tqdm progress bar
    :param skip_below: If true, skip sources whose names are less than this (used for iterative
        curation)
    :param skip_pyobo: If true, skip sources implemented in PyOBO
    :param skip_set: A pre-defined blacklist to skip
    :param strict: If true, will raise exceptions and crash the program instead of logging them.
    :param kwargs: Keyword arguments passed to ``f``.
    :yields: A prefix and the result of the callable ``f``
    :raises TypeError: If a type error is raised, it gets re-raised
    :raises urllib.error.HTTPError: If the resource could not be downloaded
    :raises urllib.error.URLError: If another problem was encountered during download
    :raises ValueError: If the data was not in the format that was expected (e.g., OWL)
    """
    it = sorted(bioregistry.read_bioregistry())
    if use_tqdm:
        it = tqdm(it, disable=None, desc='Resources')
    for prefix in it:
        if use_tqdm:
            it.set_postfix({'prefix': prefix})
        # Skip filtering is done inline here (the newer variant delegates to _prefixes).
        if prefix in SKIP:
            tqdm.write(f'skipping {prefix} because in default skip set')
            continue
        if skip_set and prefix in skip_set:
            tqdm.write(f'skipping {prefix} because in skip set')
            continue
        if skip_below is not None and prefix < skip_below:
            continue
        if skip_pyobo and has_nomenclature_plugin(prefix):
            continue
        try:
            yv = f(prefix, **kwargs)
        except NoBuild:
            # Resource declared itself unbuildable; silently move on.
            continue
        except urllib.error.HTTPError as e:
            logger.warning('[%s] HTTP %s: unable to download %s', prefix, e.getcode(), e.geturl())
            # Download failures on deprecated resources are tolerated even in strict mode.
            if strict and not bioregistry.is_deprecated(prefix):
                raise
        except urllib.error.URLError:
            logger.warning('[%s] unable to download', prefix)
            if strict and not bioregistry.is_deprecated(prefix):
                raise
        except MissingPrefix as e:
            logger.warning('[%s] missing prefix: %s', prefix, e)
            if strict:
                raise e
        except ValueError as e:
            if _is_xml(e):
                # an XML (error) page came back instead of the expected resource content
                logger.info(
                    'no resource available for %s. See http://www.obofoundry.org/ontology/%s',
                    prefix, prefix)
            else:
                logger.exception('[%s] error while parsing: %s', prefix, e.__class__)
                # NOTE(review): reconstructed from mangled indentation — this re-raise
                # is taken to apply only to the non-XML branch; confirm against VCS.
                if strict:
                    raise e
        except TypeError as e:
            logger.exception('TypeError on %s', prefix)
            if strict:
                raise e
        else:
            # Only yield when ``f`` succeeded; failed prefixes are skipped.
            yield prefix, yv
def _iter_metadata(**kwargs):
    """Yield ``(prefix, version, date, deprecated)`` rows from each resource's metadata."""
    for prefix, data in iter_helper_helper(get_metadata, **kwargs):
        version, date = data["version"], data["date"]
        tqdm.write(f"[{prefix}] using version {version}")
        deprecated = bioregistry.is_deprecated(prefix)
        yield prefix, version, date, deprecated