def _ontology_local_repo(self):
    try:
        stated_repo = Path(self.config['ontology_local_repo'])
    except (KeyError, TypeError, FileNotFoundError) as e:
        stated_repo = Path('/dev/null/does-not-exist')

    maybe_repo = self._maybe_repo

    if stated_repo.exists():
        return stated_repo
    elif maybe_repo.exists():
        return maybe_repo
    else:
        maybe_start = Path(__file__).parent.parent.parent.absolute()
        maybe_base = maybe_start
        fsroot = Path('/')
        while maybe_base != fsroot:
            maybe_repo = maybe_base / self.ontology_repo
            if maybe_repo.exists():
                log.info(tc.blue('INFO:') + f' Ontology repository found at {maybe_repo}')
                return maybe_repo
            else:
                maybe_base = maybe_base.parent
        else:
            log.warning(tc.red('WARNING:') +
                        f' No repository found in any parent directory of {maybe_start}')

        return Path('/dev/null/does-not-exist')  # seems reasonable ...
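# A minimal, self-contained sketch (not part of this module) of the upward
# search pattern used in _ontology_local_repo above: walk from a starting
# directory toward the filesystem root and return the first parent that
# contains a directory with the given name. find_repo_upwards, its arguments,
# and the repo name in the usage line are hypothetical illustrations, not
# pyontutils API.
from pathlib import Path

def find_repo_upwards(repo_name, start):
    base = Path(start).absolute()
    fsroot = Path(base.anchor)
    while base != fsroot:
        candidate = base / repo_name
        if candidate.exists():
            return candidate  # first parent directory containing repo_name
        base = base.parent
    return None  # nothing found in any parent directory

# usage sketch: find_repo_upwards('NIF-Ontology', Path.cwd())  # repo name is illustrative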
def auth(self):
    newline = '\n'
    scopes = list(self._scopes)
    if self.options.debug:
        log.debug(f'requesting scopes:\n{newline.join(scopes)}')

    service = _get_oauth_service(readonly=self.options.readonly,
                                 SCOPES=scopes)  # FIXME decouple this ...
    log.info(f'Auth finished successfully for scopes:\n{newline.join(scopes)}')
def make_triple(id_, field, value, column_to_predicate=_column_to_predicate):
    if field == 'id':
        if value.startswith('SCR:'):
            value = owl.NamedIndividual
        else:
            log.info(value)
            value = owl.Class

    #if type(value) == bool:
        #if value:
            #value = rdflib.Literal(True)
        #else:
            #value = rdflib.Literal(False)

    return id_, column_to_predicate[field], value
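# An illustrative sketch (assumptions flagged below) of how a make_triple-style
# column-to-predicate mapping yields rdflib triples. The mapping table, the
# function name example_make_triple, and the values in the usage line are
# hypothetical stand-ins; only the rdflib namespaces are standard.
import rdflib
from rdflib.namespace import OWL, RDF, RDFS

_example_column_to_predicate = {  # hypothetical mapping for illustration
    'id': RDF.type,
    'name': RDFS.label,
}

def example_make_triple(id_, field, value,
                        column_to_predicate=_example_column_to_predicate):
    if field == 'id':
        # mirror the original: SCR-prefixed ids become individuals, others classes
        value = OWL.NamedIndividual if value.startswith('SCR:') else OWL.Class
    return id_, column_to_predicate[field], value

# usage sketch:
# example_make_triple(rdflib.URIRef('http://example.org/SCR_000000'), 'id', 'SCR:000000')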
def bootstrap_config():
    if not devconfig.config_file.exists():
        # scigraph api
        maybe_key = get_api_key()
        if maybe_key:
            from pyontutils.scigraph_client import BASEPATH
            devconfig.scigraph_api = BASEPATH
        else:
            devconfig.scigraph_api = devconfig.scigraph_api.default

        # ontology repo
        p1 = Path(__file__).resolve().parent.parent.parent
        p2 = Path(devconfig.git_local_base).resolve().absolute()
        print(p1, p2)
        if (p1 / devconfig.ontology_repo).exists():
            if p1 != p2:
                devconfig.git_local_base = p1
    else:
        log.info(f'config already exists at {devconfig.config_file}')
def inner(local_filepath, remote=False):
    # NOTE bigleaves, dobig, p, oo, triples, done, imported_iri_vs_ontology_iri,
    # revert, readonly, remote_base, and local_base are not defined here; they
    # come from the enclosing scope of this nested helper
    if noneMembers(local_filepath, *bigleaves) or dobig:
        ext = os.path.splitext(local_filepath)[-1]
        if ext == '.ttl':
            infmt = 'turtle'
        else:
            log.info((ext, local_filepath))
            infmt = None

        if remote:
            resp = requests.get(local_filepath)  # TODO nonblocking pull these out, fetch, run inner again until done
            raw = resp.text.encode()
        else:
            try:
                with open(local_filepath, 'rb') as f:
                    raw = f.read()
            except FileNotFoundError as e:
                if local_filepath.startswith('file://'):
                    log.info(f'local_imports has already been run, skipping {local_filepath}')
                    return
                    #raise ValueError('local_imports has already been run') from e
                else:
                    log.exception(e)  # TODO raise a warning if the file cannot be matched
                    # seems like good practice to have any imported ontology under
                    # version control so all imports are guaranteed to have good
                    # provenance and not split the prior information between the
                    # scigraph config and the repository, the repository remains
                    # the source of truth, load.yaml files can then pick a subset
                    # of the properly tracked files to load as they see fit, but
                    # not add to them (at least in pyontutils land)
                    raw = b''

        if oo in raw:  # we only care if there are imports or an ontology iri
            scratch = OntGraph()
            if infmt == 'turtle':
                data, rest = raw.split(b'###', 1)
            elif infmt is None:  # assume xml
                xml_tree = etree.parse(BytesIO(raw))
                xml_root = xml_tree.getroot()
                xml_ontology = xml_tree.xpath("/*[local-name()='RDF']/*[local-name()='Ontology']")
                xml_root.clear()
                xml_root.append(xml_ontology[0])
                data = etree.tostring(xml_root)

            scratch.parse(data=data, format=infmt)
            for s in scratch.subjects(rdf.type, owl.Ontology):
                triples.add((s, owl.sameAs, rdflib.URIRef(local_filepath)))
                # somehow this breaks computing the chain
                #for p in (rdfs.comment, skos.definition, definition, dc.title, rdfs.label):
                    #for o in scratch[s:p]:
                        #triples.add((s, p, o))

            for s, o in sorted(scratch.subject_objects(p)):
                if revert:
                    raise NotImplementedError('TODO')

                nlfp = o.replace(remote_base, local_base)
                triples.add((s, p, o))
                if 'http://' in local_filepath or 'external' in local_filepath:
                    # FIXME what to do about https used inconsistently :/
                    if 'external' in local_filepath:
                        imported_iri = rdflib.URIRef(local_filepath.replace(local_base, remote_base))  # inefficient
                    else:
                        imported_iri = rdflib.URIRef(local_filepath)

                    if s != imported_iri:
                        imported_iri_vs_ontology_iri[imported_iri] = s  # kept for the record
                        triples.add((imported_iri, p, s))  # bridge imported != ontology iri

                if local_base in nlfp and 'file://' not in o:
                    # FIXME file:// should not be slipping through here ...
                    scratch.add((s, p, rdflib.URIRef('file://' + nlfp)))
                    scratch.remove((s, p, o))

                if nlfp not in done:
                    done.append(nlfp)
                    if local_base in nlfp and 'external' not in nlfp:  # skip externals TODO
                        inner(nlfp)
                    elif readonly:  # read external imports
                        if 'external' in nlfp:
                            inner(nlfp)
                        else:
                            inner(nlfp, remote=True)

            if not readonly:
                _orp = CustomTurtleSerializer.roundtrip_prefixes  # FIXME awful hack :/
                CustomTurtleSerializer.roundtrip_prefixes = True
                ttl = scratch.serialize(format='nifttl', encoding='utf-8')
                CustomTurtleSerializer.roundtrip_prefixes = _orp
                ndata, comment = ttl.split(b'###', 1)
                out = ndata + b'###' + rest
                with open(local_filepath, 'wb') as f:
                    f.write(out)
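# A reduced sketch (not the module's API) of the import-rewriting step performed
# inside inner above: every owl:imports object under remote_base is replaced by
# a file:// IRI rooted at local_base. localize_imports and the remote_base /
# local_base arguments are hypothetical names for illustration only.
import rdflib
from rdflib.namespace import OWL

def localize_imports(graph, remote_base, local_base):
    # copy to a list so the graph can be mutated while iterating
    for s, o in list(graph.subject_objects(OWL.imports)):
        if str(o).startswith(remote_base):
            nlfp = str(o).replace(remote_base, local_base)
            graph.remove((s, OWL.imports, o))
            graph.add((s, OWL.imports, rdflib.URIRef('file://' + nlfp)))
    return graph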
def __init__(self, zip_location, git_remote, org, git_local, repo_name, branch,
             commit, remote_base, load_base, graphload_config_template,
             graphload_ontologies, patch_config, patch, scigraph_commit,
             post_clone=lambda: None, fix_imports_only=False, check_built=False):

    date_today = TODAY()

    load_from_repo = True
    local_base = jpth(git_local, repo_name)
    if load_from_repo:
        repo, nob = self._set_up_repo_state(local_base, git_remote, org, git_local,
                                            repo_name, branch, commit, post_clone)
        ontology_commit = repo.head.object.hexsha[:COMMIT_HASH_HEAD_LEN]
    else:
        ontology_commit = 'NONE'

    config_path, config = self.make_graphload_config(graphload_config_template,
                                                     graphload_ontologies,
                                                     zip_location, date_today)
    config_hash = identity_json(config, sort_lists=True).hex()

    (graph_path, zip_path, zip_command,
     wild_zip_path) = self._set_up_paths(zip_location, repo_name, branch,
                                         scigraph_commit, ontology_commit,
                                         config_hash, date_today)

    # NOTE config is modified in place
    ontologies = self.configure_config(config, graph_path, remote_base,
                                       local_base, config_path)

    load_command = load_base.format(config_path=config_path)  # 'exit 1' to test
    log.info(load_command)

    if load_from_repo:
        # replace raw github imports with ontology.neuinfo.org iris to simplify the import chain
        # FIXME this is hardcoded and will not generalize ...
        fix_imports = ("find " + local_base +
                       (" -name '*.ttl' -exec sed -i"
                        " 's,<http.\\+/ttl/,<http://ontology.neuinfo.org/NIF/ttl/,' {} \\;"))
        os.system(fix_imports)

    if load_from_repo and not fix_imports_only:
        def reset_state(original_branch=nob):
            repo.git.checkout('--', local_base)
            original_branch.checkout()
    else:
        reset_state = lambda: None

    with execute_regardless(reset_state):  # FIXME start this immediately after we obtain nob?
        # main
        if load_from_repo:
            if patch:
                # FIXME TODO XXX does scigraph load from the catalog!??!??
                # because it seems like doid loads correctly without using local_versions
                # which would be cool, if confusing
                local_versions = tuple(do_patch(patch_config, local_base))
            else:
                local_versions = tuple()

            itrips = local_imports(remote_base, local_base, ontologies,
                                   local_versions=local_versions,
                                   dobig=True)  # SciGraph doesn't support catalog.xml
            catalog = make_catalog(itrips)
            with open(Path(local_base, 'catalog.xml'), 'wt') as f:
                f.write(catalog)
        else:
            itrips = []

        maybe_zip_path = glob(wild_zip_path)
        if fix_imports_only:
            pass
        elif not maybe_zip_path:
            if check_built:
                print('The graph has not been loaded.')
                raise NotBuiltError('The graph has not been loaded.')

            #breakpoint()
            failure = os.system(load_command)
            if failure:
                if os.path.exists(graph_path):
                    shutil.rmtree(graph_path)
            else:
                os.rename(config_path,  # save the config for easier debugging
                          graph_path / config_path.name)
                cpr = config_path.with_suffix(config_path.suffix + '.raw')
                os.rename(cpr, graph_path / cpr.name)
                failure = os.system(zip_command)  # graphload zip
        else:
            zip_path = maybe_zip_path[0]  # this way we get the actual date
            print('Graph already loaded at', zip_path)

        # this needs to be run while the branch is checked out
        # FIXME might be worth adding this to the load config?
        self.ontologies = [get_iri(load_header(rec['url']))
                           for rec in config['ontologies']]

    self.zip_path = zip_path
    self.itrips = itrips
    self.config = config
def get_records(user=defaults['--user'], host=defaults['--host'],
                port=defaults['--port'], database=defaults['--database'],
                field_mapping=_field_mapping):
    DB_URI = 'mysql+{driver}://{user}:{password}@{host}:{port}/{db}'
    config = mysql_conn_helper(host, database, user, port)
    try:
        engine = create_engine(DB_URI.format(driver='mysqlconnector', **config))
    except ModuleNotFoundError:
        engine = create_engine(DB_URI.format(driver='pymysql', **config))

    config = None  # all weakrefs should be gone by now?
    del config  # i wonder whether this actually cleans it up when using **config

    insp = inspect(engine)
    #names = [c['name'] for c in insp.get_columns('registry')]
    #resource_columns = [c['name'] for c in insp.get_columns('resource_columns')]
    #resource_data = [c['name'] for c in insp.get_columns('resource_data')]
    #resource_fields = [c['name'] for c in insp.get_columns('resource_fields')]
    #resources = [c['name'] for c in insp.get_columns('resources')]
    #conn.execute('SELECT * from registry;')
    if 1:  # this if is for indentation purposes only
        #with engine.connect() as conn:
        conn = engine
        tables = ('resource_columns', 'resource_data', 'resource_fields', 'resources')
        data = {t: ([c['name'] for c in insp.get_columns(t)],
                    conn.execute('SELECT * from %s limit 20;' % t).fetchall())
                for t in tables}
        all_fields = [n[0] for n in
                      conn.execute('SELECT distinct(name) FROM resource_fields;').fetchall()]

        #query = conn.execute('SELECT r.rid, r.original_id, r.type, rc.name, rc.value from resources as r JOIN'
                             #' resource_columns as rc ON r.id=rc.rid'
                             #' WHERE rc.name IN %s limit 1000;' % str(tuple([n for n in field_mapping if n != 'MULTI'])))  # XXX DANGER THIS QUERY IS O(x^n) :x
                             #' ORDER BY r.rid limit 2000;'
        #query = conn.execute('SELECT r.rid, r.original_id, r.type, rc.name, rc.value from resource_columns as rc JOIN'
                             #' resources as r ON rc.rid=r.id'
                             #' WHERE rc.name IN %s;' % str(tuple([n for n in field_mapping if n != 'MULTI'])))  # XXX DANGER why does > 2000 limit break stuff?
        #join = query.fetchall()
        #print('running join')

        log.info('running 1')
        r_query = conn.execute('SELECT id, rid, original_id, type, status FROM resources WHERE id >= 0;')  # avoid the various test entries :(
        log.info('fetching 1')
        r = r_query.fetchall()

        log.info('running 2')
        rc_query = conn.execute('SELECT rid, name, value, version FROM resource_columns as rc'
                                ' WHERE rc.rid >= 0 AND rc.name IN %s;'
                                % str(tuple([n for n in field_mapping if n != 'MULTI'])))
        log.info('fetching 2')
        rc = rc_query.fetchall()

    fixesForResourcesAndColumns(r, rc)
    records = make_records(r, rc, field_mapping)
    log.info('Fetching and data prep done.')
    return records
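# A hedged alternative sketch, not the behavior of get_records above: the same
# resource_columns query expressed with a bound, expanding IN parameter instead
# of %-formatting a tuple into the SQL string. Assumes SQLAlchemy >= 1.2; the
# helper name fetch_resource_columns is hypothetical, while conn and
# field_mapping stand in for the objects used in the surrounding module.
from sqlalchemy import text, bindparam

def fetch_resource_columns(conn, field_mapping):
    stmt = text(
        'SELECT rid, name, value, version FROM resource_columns '
        'WHERE rid >= 0 AND name IN :names'
    ).bindparams(bindparam('names', expanding=True))
    names = [n for n in field_mapping if n != 'MULTI']
    return conn.execute(stmt, {'names': names}).fetchall()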