def __init__(self, config, repository, store):
    """Keep references to the collaborators and create the empty caches.

    Args:
        config: stored unchanged as ``self.config``
        repository: stored unchanged as ``self.repository``
        store: stored unchanged as ``self.store``
    """
    self.config, self.repository, self.store = config, repository, store
    # One independent cache each for commits, blobs and graph configurations.
    self._commits, self._blobs, self._graphconfigs = Cache(), Cache(), Cache()
def testCacheCapacity(self):
    """With capacity 1, only the most recently stored key stays retrievable."""
    bounded = Cache(capacity=1)
    for key, value in (("key1", "value1"), ("key2", "value2")):
        bounded.set(key, value)

    # The first key must be gone once the second one has been stored.
    with self.assertRaises(KeyError):
        bounded.get("key1")

    self.assertEqual(bounded.get("key2"), "value2")
    self.assertEqual(bounded.size, 1)
def testOverwriteEntry(self):
    """Storing a key twice keeps only the latest value and a size of one."""
    store = Cache()
    for payload in ("value", "value2"):
        store.set("key", payload)

    self.assertEqual(store.get("key"), "value2")
    self.assertEqual(store.size, 1)
def testSetEntry(self):
    """A stored value can be read back and is counted in the cache size."""
    store = Cache()
    key, value = "key", "value"
    store.set(key, value)

    self.assertEqual(store.get(key), value)
    self.assertEqual(store.size, 1)
def testRemoveEntry(self):
    """Removing the only entry brings the cache size back to zero."""
    store = Cache()
    key = "key"
    store.set(key, "value")
    self.assertEqual(store.size, 1)

    store.remove(key)
    self.assertEqual(store.size, 0)
class Quit(object):
    """Quit object which keeps the store synchronised with the repository.

    The instance holds three caches:

    * ``_commits``: commit id -> set of ``(file name, oid)`` blobs
    * ``_blobs``: ``(file name, oid)`` -> ``(FileReference, graph)``
    * ``_graphconfigs``: commit id -> graph configuration of that commit
    """

    # Handle of the last spawned "git gc" subprocess (None until one is started).
    gcProcess = None

    def __init__(self, config, repository, store):
        """Store the collaborators and create the empty caches."""
        self.config = config
        self.repository = repository
        self.store = store
        self._commits = Cache()
        self._blobs = Cache()
        self._graphconfigs = Cache()

    def _exists(self, cid):
        """Return True if the store already holds a quad about commit ``cid``.

        Looks for any quad with the commit URI as subject in ``QUIT.default``.
        """
        uri = QUIT['commit-' + cid]
        for _ in self.store.store.quads((uri, None, None, QUIT.default)):
            return True
        return False

    def getDefaultBranch(self):
        """Get the default branch for the Git repository which should be used in the application.

        This will be the default branch as configured, if it is configured
        or the current HEAD of the repository if the HEAD is born.
        Will default to "master"

        Returns:
            A string containing the branch name.
        """
        config_default_branch = self.config.getDefaultBranch()
        if config_default_branch:
            return config_default_branch
        repository_current_head = self.repository.current_head
        if repository_current_head:
            return repository_current_head
        return "master"

    def rebuild(self):
        """Remove all triples of every store context, then resynchronise."""
        for context in self.store.contexts():
            self.store.remove((None, None, None), context)
        self.syncAll()

    def syncAll(self):
        """Synchronize store with repository data.

        Walks every tag and branch of the repository and calls
        ``syncSingle`` for each commit that is not yet known to the store,
        parents before children.
        """
        def traverse(commit, seen):
            """Collect the not yet synchronised ancestry of ``commit``.

            The returned list is ordered child first; the caller pops from
            the end so that ancestors are synchronised first.
            """
            commits = []
            merges = []

            # Follow the first-parent chain until a known commit is reached.
            while True:
                commit_id = commit.id
                if commit_id in seen:
                    break
                seen.add(commit_id)
                if self._exists(commit_id):
                    break
                commits.append(commit)
                parents = commit.parents
                if not parents:
                    break
                commit = parents[0]
                if len(parents) > 1:
                    # Remember where the additional parents have to be spliced in.
                    merges.append((len(commits), parents[1:]))

            # Splice from the back so that earlier insert positions stay valid.
            for idx, parents in reversed(merges):
                for parent in parents:
                    commits[idx:idx] = traverse(parent, seen)
            return commits

        seen = set()

        for name in self.repository.tags_or_branches:
            initial_commit = self.repository.revision(name)
            commits = traverse(initial_commit, seen)

            while commits:
                commit = commits.pop()
                self.syncSingle(commit)

    def syncSingle(self, commit):
        """Write the changeset of ``commit`` to the store unless it is already there."""
        if not self._exists(commit.id):
            self.changeset(commit)

    def instance(self, reference, force=False):
        """Create and return dataset for a given commit id.

        Args:
            reference: commit id or reference of the commit to retrieve
            force: force to get the dataset from the git repository instead of the internal cache

        Returns:
            A tuple ``(VirtualGraph, commit id)``. The commit id is None when
            ``reference`` is falsy; the dataset is empty in that case.
        """
        default_graphs = []
        commitid = None

        if reference:
            commit = self.repository.revision(reference)
            commitid = commit.id

            for blob in self.getFilesForCommit(commit):
                try:
                    (name, oid) = blob
                    (f, context) = self.getFileReferenceAndContext(blob, commit)
                    internal_identifier = context.identifier + '-' + str(oid)

                    if force or not self.config.hasFeature(Feature.Persistence):
                        g = context
                    else:
                        # Read the graph from the persistent store under its
                        # blob specific identifier.
                        g = RewriteGraph(self.store.store.store,
                                         internal_identifier,
                                         context.identifier)
                    default_graphs.append(g)
                except KeyError:
                    # Best effort: blobs that cannot be resolved are left out.
                    pass

        instance = InMemoryAggregatedGraph(graphs=default_graphs,
                                           identifier='default')
        return VirtualGraph(instance), commitid

    def changeset(self, commit):
        """Write provenance and/or persistence data of ``commit`` to the store.

        Returns immediately when neither the Provenance nor the Persistence
        feature is enabled. With Provenance the commit is described as an
        activity with author, committer, parents and the diff against its
        first parent. For every configured graph file an entity is recorded;
        with Persistence the triples of the graph are stored as well.

        Args:
            commit: the commit to describe
        """
        if (not self.config.hasFeature(Feature.Persistence)) and (
                not self.config.hasFeature(Feature.Provenance)):
            return

        g = self.store.store

        if self.config.hasFeature(Feature.Provenance):
            role_author_uri = QUIT['Author']
            role_committer_uri = QUIT['Committer']

            g.add((role_author_uri, is_a, PROV['Role']))
            g.add((role_committer_uri, is_a, PROV['Role']))

        # Create the commit
        i1, _ = self.instance(commit.id, True)

        commit_uri = QUIT['commit-' + commit.id]

        if self.config.hasFeature(Feature.Provenance):
            g.add((commit_uri, is_a, PROV['Activity']))

            if 'Source' in commit.properties.keys():
                g.add((commit_uri, is_a, QUIT['Import']))
                sources = commit.properties['Source'].strip()
                for source in re.findall("<.*?>", sources):
                    g.add((commit_uri, QUIT['dataSource'],
                           URIRef(source.strip("<>"))))
            if 'Query' in commit.properties.keys():
                g.add((commit_uri, is_a, QUIT['Transformation']))
                g.add((commit_uri, QUIT['query'],
                       Literal(commit.properties['Query'].strip())))

            g.add((commit_uri, QUIT['hex'], Literal(commit.id)))
            g.add((commit_uri, PROV['startedAtTime'],
                   Literal(git_timestamp(commit.author.time,
                                         commit.author.offset),
                           datatype=XSD.dateTime)))
            g.add((commit_uri, PROV['endedAtTime'],
                   Literal(git_timestamp(commit.committer.time,
                                         commit.committer.offset),
                           datatype=XSD.dateTime)))
            g.add((commit_uri, RDFS['label'],
                   Literal(commit.message.strip())))

            # Author
            author_hash = pygit2.hash(commit.author.email).hex
            author_uri = QUIT['user-' + author_hash]
            g.add((commit_uri, PROV['wasAssociatedWith'], author_uri))

            g.add((author_uri, is_a, PROV['Agent']))
            g.add((author_uri, RDFS.label, Literal(commit.author.name)))
            g.add((author_uri, FOAF.mbox, Literal(commit.author.email)))

            q_author_uri = BNode()
            g.add((commit_uri, PROV['qualifiedAssociation'], q_author_uri))
            g.add((q_author_uri, is_a, PROV['Association']))
            g.add((q_author_uri, PROV['agent'], author_uri))
            # NOTE(review): the author role uses PROV['role'] while the
            # committer role uses PROV['hadRole']; kept unchanged because
            # stored data may rely on it -- confirm which one is intended.
            g.add((q_author_uri, PROV['role'], role_author_uri))

            if commit.author.name != commit.committer.name:
                # Committer
                committer_hash = pygit2.hash(commit.committer.email).hex
                committer_uri = QUIT['user-' + committer_hash]
                g.add((commit_uri, PROV['wasAssociatedWith'], committer_uri))

                g.add((committer_uri, is_a, PROV['Agent']))
                g.add((committer_uri, RDFS.label,
                       Literal(commit.committer.name)))
                g.add((committer_uri, FOAF.mbox,
                       Literal(commit.committer.email)))

                q_committer_uri = BNode()
                g.add((commit_uri, PROV['qualifiedAssociation'],
                       q_committer_uri))
                g.add((q_committer_uri, is_a, PROV['Association']))
                # The committer association has to point to the committer,
                # not to the author.
                g.add((q_committer_uri, PROV['agent'], committer_uri))
                g.add((q_committer_uri, PROV['hadRole'], role_committer_uri))
            else:
                # Author and committer are the same agent with both roles.
                g.add((q_author_uri, PROV['hadRole'], role_committer_uri))

            # Parents
            for parent in iter(commit.parents or []):
                parent_uri = QUIT['commit-' + parent.id]
                g.add((commit_uri, QUIT["preceedingCommit"], parent_uri))
                g.add((commit_uri, PROV["wasInformedBy"], parent_uri))

            # Diff (against the first parent only)
            parent = next(iter(commit.parents or []), None)

            i2, _ = self.instance(parent.id, True) if parent else (None, None)

            delta = graphdiff(i2.store if i2 else None, i1.store)
            for index, (iri, changesets) in enumerate(delta.items()):
                update_uri = QUIT['update-{}-{}'.format(commit.id, index)]
                g.add((update_uri, QUIT['graph'], iri))
                g.add((commit_uri, QUIT['updates'], update_uri))
                for (op, triples) in changesets:
                    op_uri = QUIT[op + '-' + commit.id]
                    g.add((update_uri, QUIT[op], op_uri))
                    g.addN((s, p, o, op_uri) for s, p, o in triples)

        # Entities
        if commit.id not in self._graphconfigs:
            self.updateGraphConfig(commit.id)

        uri_file_map = self._graphconfigs.get(commit.id).getgraphurifilemap()

        for entity in commit.node().entries(recursive=True):
            # todo check if file was changed
            if entity.is_file:

                if entity.name not in uri_file_map.values():
                    continue

                graphUri = self._graphconfigs.get(
                    commit.id).getgraphuriforfile(entity.name)
                blob = (entity.name, entity.oid)

                try:
                    _, context = self.getFileReferenceAndContext(blob, commit)
                except KeyError:
                    # Fall back to parsing the entity directly. The result
                    # has to be bound to ``context`` because the code below
                    # reads ``context``.
                    context = Graph(identifier=graphUri)
                    context.parse(data=entity.content, format='nt')

                    self._blobs.set(
                        blob,
                        (FileReference(entity.name, entity.content), context))

                private_uri = QUIT["graph-{}".format(entity.oid)]

                if (self.config.hasFeature(Feature.Provenance) or
                        self.config.hasFeature(Feature.Persistence)):
                    g.add((private_uri, is_a, PROV['Entity']))
                    g.add((private_uri, PROV['specializationOf'],
                           context.identifier))
                    g.add((private_uri, PROV['wasGeneratedBy'], commit_uri))

                    q_usage = BNode()
                    g.add((private_uri, PROV['qualifiedGeneration'], q_usage))
                    g.add((q_usage, is_a, PROV['Generation']))
                    g.add((q_usage, PROV['activity'], commit_uri))

                    prev = next(entity.history(), None)
                    if prev:
                        # Same naming scheme as ``private_uri`` so that the
                        # derivation links to the entity recorded for the
                        # previous blob.
                        prev_uri = QUIT["graph-{}".format(prev.oid)]
                        g.add((private_uri, PROV['wasDerivedFrom'], prev_uri))
                        g.add((commit_uri, PROV['used'], prev_uri))

                        q_derivation = BNode()
                        g.add((private_uri, PROV['qualifiedDerivation'],
                               q_derivation))
                        g.add((q_derivation, is_a, PROV['Derivation']))
                        g.add((q_derivation, PROV['entity'], prev_uri))
                        g.add((q_derivation, PROV['hadActivity'], commit_uri))

                if self.config.hasFeature(Feature.Persistence):
                    g.addN((s, p, o, private_uri) for s, p, o
                           in context.triples((None, None, None)))

    def getFilesForCommit(self, commit):
        """Get all (entry name, oid) tuples for a commit.

        Only files that are listed in the graph configuration of the commit
        are returned. On cache miss this method also updates the commits
        cache.

        Args:
            commit: the commit to inspect, may be None

        Returns:
            A set of ``(name, oid)`` tuples; empty if ``commit`` is None.
        """
        if commit is None:
            return set()

        if commit.id not in self._commits:
            if commit.id not in self._graphconfigs:
                self.updateGraphConfig(commit.id)

            uriFileMap = self._graphconfigs.get(commit.id).getgraphurifilemap()
            blobs = set()

            for entity in commit.node().entries(recursive=True):
                if entity.is_file:
                    if entity.name not in uriFileMap.values():
                        continue
                    blob = (entity.name, entity.oid)
                    blobs.add(blob)
            self._commits.set(commit.id, blobs)
            return blobs
        return self._commits.get(commit.id)

    def getFileReferenceAndContext(self, blob, commit):
        """Get the FileReference and Context for a given blob (name, oid) of a commit.

        On cache miss the file is read from the commit, parsed as N-Triples
        and the result is stored in the blobs cache.

        Args:
            blob: tuple ``(name, oid)``
            commit: the commit the blob belongs to

        Returns:
            A tuple ``(FileReference, Graph)``.
        """
        if commit.id not in self._graphconfigs:
            self.updateGraphConfig(commit.id)

        if blob not in self._blobs:
            (name, oid) = blob
            content = commit.node(path=name).content
            graphUri = self._graphconfigs.get(
                commit.id).getgraphuriforfile(name)
            graph = Graph(identifier=URIRef(graphUri))
            graph.parse(data=content, format='nt')
            quitWorkingData = (FileReference(name, content), graph)
            self._blobs.set(blob, quitWorkingData)
            return quitWorkingData
        return self._blobs.get(blob)

    def applyQueryOnCommit(self, parsedQuery, parent_commit_ref, target_ref,
                           query=None, default_graph=None, named_graph=None):
        """Apply an update query on the graph and the git repository.

        The changes that were applied before a possible error are still
        committed; the error is raised afterwards.

        Args:
            parsedQuery: the parsed update query
            parent_commit_ref: reference of the commit the query is applied on
            target_ref: a ref/branch were the commit will be applied to
            query: the query string, used for the commit message
            default_graph: using-graph-uri values (None is treated as empty)
            named_graph: using-named-graph-uri values (None is treated as empty)

        Returns:
            The id of the new commit as returned by ``commit``.

        Raises:
            The exception reported by ``graph.update``, if any.
        """
        graph, commitid = self.instance(parent_commit_ref)
        resultingChanges, exception = graph.update(parsedQuery)
        if exception:
            # TODO need to revert or invalidate the graph at this point.
            pass
        oid = self.commit(graph, resultingChanges, 'New Commit from QuitStore',
                          parent_commit_ref, target_ref, query=query,
                          default_graph=default_graph,
                          named_graph=named_graph)
        if exception:
            raise exception
        return oid

    def commit(self, graph, delta, message, parent_commit_ref, target_ref,
               query=None, default_graph=None, named_graph=None, **kwargs):
        """Commit changes after applying deltas to the blobs.

        This methods analyzes the delta and applies the changes to the blobs of the repository.
        A commit message is built with help of message and if called from endpoint with query,
        default_graph and named_graph.
        **kwargs can be used to extend the commit message with custom key-value-pairs.

        Args:
            graph: the current graph instance
            delta: delta that will be applied
            message: commit message
            parent_commit_ref: the commit-id of preceeding commit
            target_ref: a ref/branch were the commit will be applied to
            query: the query that lead to the commit
            default_graph: using-graph-uri values from SPARQL protocol
                (None is treated as empty)
            named_graph: using-named-graph-uri values from SPARQL protocol
                (None is treated as empty)

        Returns:
            The newly created commits id, or None if the delta is empty or
            no commit was created.
        """
        if self._isDeltaEmpty(delta):
            return

        parent_commit_id = None
        parent_commit = None
        blobs = []
        blobs_new = set()
        if parent_commit_ref:
            parent_commit = self.repository.revision(parent_commit_ref)
        if parent_commit:
            parent_commit_id = parent_commit.id
            try:
                blobs = self.getFilesForCommit(parent_commit)
            except KeyError:
                pass
        index = self.repository.index(parent_commit_id)

        if parent_commit_id not in self._graphconfigs:
            self.updateGraphConfig(parent_commit_id)

        graphconfig = self._graphconfigs.get(parent_commit_id)
        known_files = graphconfig.getfiles().keys()

        # Changes to known graphs go to their files; whatever is left in the
        # delta afterwards belongs to graphs that get a new file.
        blobs_new = self._applyKnownGraphs(delta, blobs, parent_commit, index)
        new_contexts = self._applyUnknownGraphs(delta, known_files)
        new_config = copy(graphconfig)

        for identifier, fileReference in new_contexts.items():
            # Add new blobs to repo
            index.add(fileReference.path, fileReference.content)
            if graphconfig.mode == 'graphfiles':
                index.add(fileReference.path + '.graph', identifier + "\n")

            # Update config
            new_config.addgraph(identifier, fileReference.path, 'nt')

            # Update Cache and add new contexts to store
            blob = fileReference.path, index.stash[fileReference.path][0]
            self._blobs.set(
                blob, (fileReference, graph.store.get_context(identifier)))
            blobs_new.add(blob)
        if graphconfig.mode == 'configuration':
            index.add('config.ttl',
                      new_config.graphconf.serialize(format='turtle').decode())

        message = self._build_message(message, query, delta, default_graph,
                                      named_graph, **kwargs)
        author = self.repository._repository.default_signature

        oid = index.commit(message, author.name, author.email, ref=target_ref)

        if self.config.hasFeature(Feature.GarbageCollection):
            self.garbagecollection()

        if oid:
            self._commits.set(oid.hex, blobs_new)
            commit = self.repository.revision(oid.hex)
            self.syncSingle(commit)
            return oid.hex

        # index.commit gave no oid, so there is no commit id to report.
        return None

    def _build_message(self, message, query, result, default_graph,
                       named_graph, **kwargs):
        """Build the commit message from the message and the query metadata.

        The message is followed by an empty line and ``key: value`` lines for
        the query, the operation types, the sources of LOAD operations, the
        graph uris and all ``kwargs``. ``default_graph`` and ``named_graph``
        are only written if they are non-empty lists.

        Returns:
            The commit message as a single string.
        """
        out = list()
        if message:
            out.append(message)
            out.append('')
        if query:
            out.append('Query: "{}"'.format(
                query.replace("\\", "\\\\").replace("\"", "\\\"")))
        source = []
        operation_types = []
        for entry in result:
            if "type" in entry:
                operation_types.append(entry["type"])
                if entry["type"] == "LOAD":
                    source.append("<{}>".format(entry["source"]))
        if operation_types:
            out.append('OperationTypes: "{}"'.format(
                ",".join(operation_types)))
        if source:
            out.append('Source: "{}"'.format(",".join(source)))
        if isinstance(default_graph, list) and len(default_graph) > 0:
            out.append('using-graph-uri: {}'.format(', '.join(default_graph)))
        if isinstance(named_graph, list) and len(named_graph) > 0:
            out.append('using-named-graph-uri: {}'.format(
                ', '.join(named_graph)))
        for k, v in kwargs.items():
            out.append('{}: "{}"'.format(k, v.replace('"', "\\\"")))
        return "\n".join(out)

    def _applyKnownGraphs(self, delta, blobs, parent_commit, index):
        """Apply the delta to the files of graphs that already exist.

        Applied changesets are removed from ``delta``. Every file is added
        to ``index`` and the blobs cache is moved to the new blob.

        Returns:
            The set of ``(file name, oid)`` blobs after the update.
        """
        blobs_new = set()
        for blob in blobs:
            (fileName, oid) = blob
            try:
                file_reference, context = self.getFileReferenceAndContext(
                    blob, parent_commit)
                for entry in delta:
                    changeset = entry['delta'].get(context.identifier, None)
                    if changeset:
                        applyChangeset(file_reference, changeset,
                                       context.identifier)
                        del (entry['delta'][context.identifier])

                index.add(file_reference.path, file_reference.content)

                # The content changed, so the blob is cached under its new oid.
                self._blobs.remove(blob)
                blob = fileName, index.stash[file_reference.path][0]
                self._blobs.set(blob, (file_reference, context))
                blobs_new.add(blob)
            except KeyError:
                pass
        return blobs_new

    def _applyUnknownGraphs(self, delta, known_blobs):
        """Create file references for graphs in the delta that have no file yet.

        Blank node identifiers and the identifier 'default' are skipped. If
        the file name derived from the graph uri is already taken, a
        numbered name ``<name>_<n>.nt`` is used.

        Returns:
            A dict which maps graph identifiers to new FileReferences.
        """
        new_contexts = {}
        for entry in delta:
            for identifier, changeset in entry['delta'].items():
                if isinstance(identifier, BNode) or str(identifier) == 'default':
                    continue  # TODO default graph use case

                if identifier not in new_contexts.keys():
                    fileName = iri_to_name(identifier) + '.nt'

                    if fileName in known_blobs:
                        reg = re.compile(
                            re.escape(iri_to_name(identifier)) + "_([0-9]+).nt")
                        # n ~ numbers (in blobname), b ~ blobname, m ~ match
                        n = [
                            int(m.group(1)) for b in known_blobs
                            for m in [reg.search(b)] if m
                        ] + [0]
                        fileName = '{}_{}.nt'.format(iri_to_name(identifier),
                                                     max(n) + 1)

                    new_contexts[identifier] = FileReference(fileName, '')

                fileReference = new_contexts[identifier]
                applyChangeset(fileReference, changeset, identifier)
        return new_contexts

    def _isDeltaEmpty(self, result):
        """Return True if no entry of ``result`` has a non-empty "delta"."""
        for entry in result:
            if "delta" in entry and entry["delta"]:
                return False
        return True

    def garbagecollection(self):
        """Start garbage collection.

        Spawns ``git gc --auto --quiet`` in the repository path unless a
        previously spawned process is still running. Errors are only logged.
        """
        try:
            # Check if the garbage collection process is still running
            if self.gcProcess is None or self.gcProcess.poll() is not None:
                # Start garbage collection with "--auto" option,
                # which immediately terminates, if it is not necessary
                self.gcProcess = subprocess.Popen(
                    ["git", "gc", "--auto", "--quiet"],
                    cwd=self.repository.path)
                logger.debug('Spawn git garbage collection.')
        except Exception as e:
            logger.debug('Git garbage collection failed to spawn.')
            logger.debug(e)

    def updateGraphConfig(self, commitId):
        """Update the graph configuration for a given commit id."""
        graphconf = QuitGraphConfiguration(self.repository._repository)
        graphconf.initgraphconfig(commitId)
        self._graphconfigs.set(commitId, graphconf)
class Quit(object):
    """Quit object which keeps the store synchronised with the repository.

    The instance holds two caches:

    * ``_commits``: commit id -> set of ``(file name, oid)`` blobs
    * ``_blobs``: ``(file name, oid)`` -> ``(FileReference, contexts)``
    """

    # Handle of the last spawned "git gc" subprocess (None until one is started).
    gcProcess = None

    def __init__(self, config, repository, store):
        """Store the collaborators and create the empty caches."""
        self.config = config
        self.repository = repository
        self.store = store
        self._commits = Cache()
        self._blobs = Cache()

    def _exists(self, cid):
        """Return True if the store already holds a quad about commit ``cid``.

        Looks for any quad with the commit URI as subject in ``QUIT.default``.
        """
        uri = QUIT['commit-' + cid]
        for _ in self.store.store.quads((uri, None, None, QUIT.default)):
            return True
        return False

    def rebuild(self):
        """Remove all triples of every store context, then resynchronise."""
        for context in self.store.contexts():
            self.store.remove((None, None, None), context)
        self.syncAll()

    def syncAll(self):
        """Synchronize store with repository data.

        Walks every tag and branch of the repository and calls
        ``syncSingle`` for each commit that is not yet known to the store,
        parents before children.
        """
        def traverse(commit, seen):
            """Collect the not yet synchronised ancestry of ``commit``.

            The returned list is ordered child first; the caller pops from
            the end so that ancestors are synchronised first.
            """
            commits = []
            merges = []

            # Follow the first-parent chain until a known commit is reached.
            while True:
                commit_id = commit.id
                if commit_id in seen:
                    break
                seen.add(commit_id)
                if self._exists(commit_id):
                    break
                commits.append(commit)
                parents = commit.parents
                if not parents:
                    break
                commit = parents[0]
                if len(parents) > 1:
                    # Remember where the additional parents have to be spliced in.
                    merges.append((len(commits), parents[1:]))

            # Splice from the back so that earlier insert positions stay valid.
            for idx, parents in reversed(merges):
                for parent in parents:
                    commits[idx:idx] = traverse(parent, seen)
            return commits

        seen = set()

        for name in self.repository.tags_or_branches:
            initial_commit = self.repository.revision(name)
            commits = traverse(initial_commit, seen)

            while commits:
                commit = commits.pop()
                self.syncSingle(commit)

    def syncSingle(self, commit, delta=None):
        """Write the changeset of ``commit`` to the store unless it is already there.

        Args:
            commit: the commit to synchronise
            delta: optional delta that is handed over to ``changeset``
        """
        if not self._exists(commit.id):
            self.changeset(commit, delta)

    def instance(self, commit_id=None, force=False):
        """Create and return dataset for a given commit id.

        Args:
            commit_id: commit id of the commit to retrieve
            force: force to get the dataset from the git repository instead of the internal cache

        Returns:
            Instance of VirtualGraph representing the respective dataset
            (an empty dataset if ``commit_id`` is falsy)
        """
        default_graphs = []

        if commit_id:
            commit = self.repository.revision(commit_id)

            for blob in self.getFilesForCommit(commit):
                try:
                    (name, oid) = blob
                    f, contexts = self.getFileReferenceAndContext(blob, commit)
                    for context in contexts:
                        internal_identifier = context.identifier + '-' + str(
                            oid)

                        if force or not self.config.hasFeature(
                                Feature.Persistence):
                            g = context
                        else:
                            # Read the graph from the persistent store under
                            # its blob specific identifier.
                            g = RewriteGraph(self.store.store.store,
                                             internal_identifier,
                                             context.identifier)
                        default_graphs.append(g)
                except KeyError:
                    # Best effort: blobs that cannot be resolved are left out.
                    pass

        instance = InMemoryAggregatedGraph(graphs=default_graphs,
                                           identifier='default')
        return VirtualGraph(instance)

    def changeset(self, commit, delta=None):
        """Write provenance and/or persistence data of ``commit`` to the store.

        Returns immediately when neither the Provenance nor the Persistence
        feature is enabled. With Provenance the commit is described as an
        activity with author, committer, parents and its diff. For every
        context of every configured graph file an entity is recorded; with
        Persistence the triples of the context are stored as well.

        Args:
            commit: the commit to describe
            delta: the changes of the commit; if falsy, the diff against the
                first parent is computed with ``graphdiff``
        """
        if (not self.config.hasFeature(Feature.Persistence)) and (
                not self.config.hasFeature(Feature.Provenance)):
            return

        g = self.store.store

        if self.config.hasFeature(Feature.Provenance):
            role_author_uri = QUIT['Author']
            role_committer_uri = QUIT['Committer']

            g.add((role_author_uri, is_a, PROV['Role']))
            g.add((role_committer_uri, is_a, PROV['Role']))

        # Create the commit
        i1 = self.instance(commit.id, True)

        commit_uri = QUIT['commit-' + commit.id]

        if self.config.hasFeature(Feature.Provenance):
            g.add((commit_uri, is_a, PROV['Activity']))

            if 'Source' in commit.properties.keys():
                g.add((commit_uri, is_a, QUIT['Import']))
                g.add((commit_uri, QUIT['dataSource'],
                       Literal(commit.properties['Source'].strip())))
            if 'Query' in commit.properties.keys():
                g.add((commit_uri, is_a, QUIT['Transformation']))
                g.add((commit_uri, QUIT['query'],
                       Literal(commit.properties['Query'].strip())))

            g.add((commit_uri, QUIT['hex'], Literal(commit.id)))
            g.add((commit_uri, PROV['startedAtTime'],
                   Literal(git_timestamp(commit.author.time,
                                         commit.author.offset),
                           datatype=XSD.dateTime)))
            g.add((commit_uri, PROV['endedAtTime'],
                   Literal(git_timestamp(commit.committer.time,
                                         commit.committer.offset),
                           datatype=XSD.dateTime)))
            g.add((commit_uri, RDFS['label'],
                   Literal(commit.message.strip())))

            # Author
            author_hash = pygit2.hash(commit.author.email).hex
            author_uri = QUIT['user-' + author_hash]
            g.add((commit_uri, PROV['wasAssociatedWith'], author_uri))

            g.add((author_uri, is_a, PROV['Agent']))
            g.add((author_uri, RDFS.label, Literal(commit.author.name)))
            g.add((author_uri, FOAF.mbox, Literal(commit.author.email)))

            q_author_uri = BNode()
            g.add((commit_uri, PROV['qualifiedAssociation'], q_author_uri))
            g.add((q_author_uri, is_a, PROV['Association']))
            g.add((q_author_uri, PROV['agent'], author_uri))
            # NOTE(review): the author role uses PROV['role'] while the
            # committer role uses PROV['hadRole']; kept unchanged because
            # stored data may rely on it -- confirm which one is intended.
            g.add((q_author_uri, PROV['role'], role_author_uri))

            if commit.author.name != commit.committer.name:
                # Committer
                committer_hash = pygit2.hash(commit.committer.email).hex
                committer_uri = QUIT['user-' + committer_hash]
                g.add((commit_uri, PROV['wasAssociatedWith'], committer_uri))

                g.add((committer_uri, is_a, PROV['Agent']))
                g.add((committer_uri, RDFS.label,
                       Literal(commit.committer.name)))
                g.add((committer_uri, FOAF.mbox,
                       Literal(commit.committer.email)))

                q_committer_uri = BNode()
                g.add((commit_uri, PROV['qualifiedAssociation'],
                       q_committer_uri))
                g.add((q_committer_uri, is_a, PROV['Association']))
                # The committer association has to point to the committer,
                # not to the author.
                g.add((q_committer_uri, PROV['agent'], committer_uri))
                g.add((q_committer_uri, PROV['hadRole'], role_committer_uri))
            else:
                # Author and committer are the same agent with both roles.
                g.add((q_author_uri, PROV['hadRole'], role_committer_uri))

            # Parents
            for parent in iter(commit.parents or []):
                parent_uri = QUIT['commit-' + parent.id]
                g.add((commit_uri, QUIT["preceedingCommit"], parent_uri))
                g.add((commit_uri, PROV["wasInformedBy"], parent_uri))

            # Diff
            if not delta:
                parent = next(iter(commit.parents or []), None)
                i2 = self.instance(parent.id, True) if parent else None

                delta = graphdiff(i2.store if i2 else None, i1.store)

            # NOTE(review): this loop reads ``delta`` as a mapping (.items()),
            # while ``commit`` iterates its ``delta`` as a sequence of
            # mappings before passing it on -- confirm the expected shape.
            for index, (iri, changesets) in enumerate(delta.items()):
                update_uri = QUIT['update-{}-{}'.format(commit.id, index)]
                g.add((update_uri, QUIT['graph'], iri))
                g.add((commit_uri, QUIT['updates'], update_uri))
                for (op, triples) in changesets:
                    op_uri = QUIT[op + '-' + commit.id]
                    g.add((update_uri, QUIT[op], op_uri))
                    g.addN((s, p, o, op_uri) for s, p, o in triples)

        # Entities
        uri_file_map = self.config.getgraphurifilemap()

        for entity in commit.node().entries(recursive=True):
            # todo check if file was changed
            if entity.is_file:

                if entity.name not in uri_file_map.values():
                    continue

                graphUris = self.config.getgraphuriforfile(entity.name)
                graphsFromConfig = set(
                    (Graph(identifier=i) for i in graphUris))

                blob = (entity.name, entity.oid)

                try:
                    f, contexts = self.getFileReferenceAndContext(blob, commit)
                except KeyError:
                    tmp = ConjunctiveGraph()
                    tmp.parse(data=entity.content, format='nquads')

                    # Info: currently filter graphs from file that were not defined in config
                    # Todo: is this the wanted behaviour?
                    contexts = set(
                        (context for context in tmp.contexts(None)
                         if context.identifier in uri_file_map)
                    ) | graphsFromConfig

                    self._blobs.set(
                        blob,
                        (FileReference(entity.name, entity.content), contexts))

                for index, context in enumerate(contexts):
                    private_uri = QUIT["graph-{}-{}".format(entity.oid, index)]

                    if (self.config.hasFeature(Feature.Provenance) or
                            self.config.hasFeature(Feature.Persistence)):
                        g.add((private_uri, is_a, PROV['Entity']))
                        g.add((private_uri, PROV['specializationOf'],
                               context.identifier))
                        g.add(
                            (private_uri, PROV['wasGeneratedBy'], commit_uri))

                        q_usage = BNode()
                        g.add((private_uri, PROV['qualifiedGeneration'],
                               q_usage))
                        g.add((q_usage, is_a, PROV['Generation']))
                        g.add((q_usage, PROV['activity'], commit_uri))

                        prev = next(entity.history(), None)
                        if prev:
                            prev_uri = QUIT["graph-{}-{}".format(
                                prev.oid, index)]
                            g.add((private_uri, PROV['wasDerivedFrom'],
                                   prev_uri))
                            g.add((commit_uri, PROV['used'], prev_uri))

                            q_derivation = BNode()
                            g.add((private_uri, PROV['qualifiedDerivation'],
                                   q_derivation))
                            g.add((q_derivation, is_a, PROV['Derivation']))
                            g.add((q_derivation, PROV['entity'], prev_uri))
                            g.add((q_derivation, PROV['hadActivity'],
                                   commit_uri))

                    if self.config.hasFeature(Feature.Persistence):
                        g.addN(
                            (s, p, o, private_uri) for s, p, o in
                            context.triples((None, None, None)))

    def getFilesForCommit(self, commit):
        """Get all (entry name, oid) tuples for a commit.

        Only files that are listed in the configured graph uri file map are
        returned. On cache miss this method also updates the commits cache.

        Returns:
            A set of ``(name, oid)`` tuples.
        """
        uriFileMap = self.config.getgraphurifilemap()

        if commit.id not in self._commits:
            blobs = set()
            for entity in commit.node().entries(recursive=True):
                if entity.is_file:
                    if entity.name not in uriFileMap.values():
                        continue
                    blob = (entity.name, entity.oid)
                    blobs.add(blob)
            self._commits.set(commit.id, blobs)
            return blobs
        return self._commits.get(commit.id)

    def getFileReferenceAndContext(self, blob, commit):
        """Get the FileReference and contexts for a given blob (name, oid) of a commit.

        On cache miss the file is read from the commit, parsed as N-Quads
        and the result is stored in the blobs cache. The contexts are the
        configured contexts found in the file plus one graph per graph uri
        that the configuration lists for the file.

        Returns:
            A tuple ``(FileReference, set of contexts)``.
        """
        uriFileMap = self.config.getgraphurifilemap()

        if blob not in self._blobs:
            (name, oid) = blob
            content = commit.node(path=name).content
            # content = self.repository._repository[oid].data
            graphUris = self.config.getgraphuriforfile(name)
            graphsFromConfig = set((Graph(identifier=i) for i in graphUris))
            tmp = ConjunctiveGraph()
            tmp.parse(data=content, format='nquads')
            contexts = set(
                (context for context in tmp.contexts(None)
                 if context.identifier in uriFileMap)) | graphsFromConfig
            quitWorkingData = (FileReference(name, content), contexts)
            self._blobs.set(blob, quitWorkingData)
            return quitWorkingData
        return self._blobs.get(blob)

    def commit(self, graph, delta, message, commit_id, ref, **kwargs):
        """Apply ``delta`` to the files of a commit and create a new commit.

        Changes to contexts of known files are written to those files. All
        changes that remain in the delta are written to the configured
        global file (``'unassigned.nq'`` if none is configured). Does
        nothing if ``delta`` is falsy.

        Args:
            graph: the current graph instance
            delta: iterable of mappings from graph identifier to changeset
            message: commit message
            commit_id: id of the commit the changes are based on
            ref: the ref the new commit is written to
            **kwargs: key-value pairs that are put in front of the message
        """
        def build_message(message, kwargs):
            """Build the commit message: kwargs lines, empty line, message."""
            out = list()
            for k, v in kwargs.items():
                out.append('{}: "{}"'.format(k, v.replace('"', "\\\"")))
            if message:
                out.append('')
                out.append(message)
            return "\n".join(out)

        def _apply(f, changeset, identifier):
            """Update the FileReference (graph uri) of a file with help of the changeset."""
            for (op, triples) in changeset:
                if op == 'additions':
                    for triple in triples:
                        # the internal _nq serializer appends '\n'
                        line = _nq(triple, identifier).rstrip()
                        f.add(line)
                elif op == 'removals':
                    for triple in triples:
                        # the internal _nq serializer appends '\n'
                        line = _nq(triple, identifier).rstrip()
                        f.remove(line)

        if not delta:
            return

        commit = self.repository.revision(commit_id)
        index = self.repository.index(commit.id)

        blobs_new = set()
        try:
            blobs = self.getFilesForCommit(commit)
        except KeyError:
            blobs = []

        for blob in blobs:
            (fileName, oid) = blob
            try:
                file_reference, contexts = self.getFileReferenceAndContext(
                    blob, commit)
                for context in contexts:
                    for entry in delta:
                        changeset = entry.get(context.identifier, None)
                        if changeset:
                            _apply(file_reference, changeset,
                                   context.identifier)
                            # Applied changesets are removed so that only
                            # unassigned ones remain in the delta.
                            del (entry[context.identifier])

                index.add(file_reference.path, file_reference.content)

                # The content changed, so the blob is cached under its new oid.
                self._blobs.remove(blob)
                blob = fileName, index.stash[file_reference.path][0]
                self._blobs.set(blob, (file_reference, contexts))
                blobs_new.add(blob)
            except KeyError:
                pass

        unassigned = set()
        f_name = self.config.getGlobalFile() or 'unassigned.nq'
        f_new = FileReference(f_name, "")
        for entry in delta:
            for identifier, changeset in entry.items():
                unassigned.add(graph.store.get_context(identifier))
                # NOTE(review): the lines are serialized with
                # graph.store.identifier, not with ``identifier`` -- confirm
                # that this is intended.
                _apply(f_new, changeset, graph.store.identifier)

        index.add(f_new.path, f_new.content)

        blob = f_name, index.stash[f_new.path][0]
        self._blobs.set(blob, (f_new, unassigned))
        blobs_new.add(blob)

        message = build_message(message, kwargs)
        author = self.repository._repository.default_signature

        oid = index.commit(message, author.name, author.email, ref=ref)

        if self.config.hasFeature(Feature.GarbageCollection):
            self.garbagecollection()

        if oid:
            self._commits.set(oid.hex, blobs_new)
            commit = self.repository.revision(oid.hex)
            if not self.repository.is_bare:
                self.repository._repository.checkout(
                    ref, strategy=pygit2.GIT_CHECKOUT_FORCE)
            self.syncSingle(commit, delta)

    def garbagecollection(self):
        """Start garbage collection.

        Spawns ``git gc --auto --quiet`` in the repository path unless a
        previously spawned process is still running. Errors are only logged.
        """
        try:
            # Check if the garbage collection process is still running
            if self.gcProcess is None or self.gcProcess.poll() is not None:
                # Start garbage collection with "--auto" option,
                # which immediately terminates, if it is not necessary
                self.gcProcess = subprocess.Popen(
                    ["git", "gc", "--auto", "--quiet"],
                    cwd=self.repository.path)
                logger.debug('Spawn git garbage collection.')
        except Exception as e:
            logger.debug('Git garbage collection failed to spawn.')
            logger.debug(e)
class Quit(object):
    """Quit object which keeps the store synchronised with the repository.

    The object bundles the configuration, the Git repository wrapper and the
    quad store, and keeps three caches:

    * ``_commits``: commit id -> set of ``(file name, oid)`` blobs
    * ``_blobs``: ``(file name, oid)`` -> ``(FileReference, graph)``
    * ``_graphconfigs``: commit id -> graph configuration of that commit
    """

    # Handle of the most recently spawned ``git gc`` process; stays None
    # until garbage collection was started for the first time.
    gcProcess = None

    def __init__(self, config, repository, store):
        """Store the collaborators and create empty caches.

        Args:
            config: the configuration object (queried for features, the
                default branch, ...)
            repository: the wrapper around the Git repository
            store: the store that is kept in sync with the repository
        """
        self.config = config
        self.repository = repository
        self.store = store
        self._commits = Cache()
        self._blobs = Cache()
        self._graphconfigs = Cache()

    def _exists(self, cid):
        """Return True if the store holds any quad about the commit `cid`."""
        uri = QUIT['commit-' + cid]
        quads = self.store.store.quads((uri, None, None, QUIT.default))
        # Only the existence of a first result matters.
        return any(True for _ in quads)

    def getDefaultBranch(self):
        """Get the default branch for the Git repository which should be used in the application.

        This will be the default branch as configured, if it is configured or the current HEAD of
        the repository if the HEAD is born. Will default to "master"

        Returns:
            A string containing the branch name.
        """
        config_default_branch = self.config.getDefaultBranch()
        if config_default_branch:
            return config_default_branch
        return self.repository.current_head or "master"

    def rebuild(self):
        """Remove all triples of every context from the store and sync again."""
        # Materialise the contexts first, so removing triples cannot
        # interfere with the iteration over the store's contexts.
        for context in list(self.store.contexts()):
            self.store.remove((None, None, None), context)
        self.syncAll()

    def syncAll(self):
        """Synchronize store with repository data."""
        def traverse(commit, seen):
            """Collect the not yet synchronised ancestors of `commit`.

            The first-parent chain is followed until a commit is reached that
            was already visited (`seen`) or already exists in the store.
            Further parents of merge commits are traversed recursively and
            spliced in at the position recorded for the merge.
            """
            commits = []
            merges = []

            while True:
                commit_id = commit.id
                if commit_id in seen:
                    break
                seen.add(commit_id)
                if self._exists(commit_id):
                    break
                commits.append(commit)
                parents = commit.parents
                if not parents:
                    break
                commit = parents[0]
                if len(parents) > 1:
                    merges.append((len(commits), parents[1:]))

            # Splice from the back so earlier insertion indices stay valid.
            for idx, parents in reversed(merges):
                for parent in parents:
                    commits[idx:idx] = traverse(parent, seen)
            return commits

        seen = set()

        for name in self.repository.tags_or_branches:
            initial_commit = self.repository.revision(name)
            commits = traverse(initial_commit, seen)

            # Popping from the end processes ancestors before descendants.
            while commits:
                self.syncSingle(commits.pop())

    def syncSingle(self, commit):
        """Write the changeset of `commit` to the store unless it is known."""
        if not self._exists(commit.id):
            self.changeset(commit)

    def instance(self, reference, force=False):
        """Create and return dataset for a given commit id.

        Args:
            reference: commit id or reference of the commit to retrieve
            force: force to get the dataset from the git repository instead of the internal cache

        Returns:
            A tuple of the VirtualGraph representing the respective dataset
            and the id of the resolved commit (None if `reference` is empty).
        """
        default_graphs = []
        commitid = None

        if reference:
            commit = self.repository.revision(reference)
            commitid = commit.id

            for blob in self.getFilesForCommit(commit):
                try:
                    (_, oid) = blob
                    (_, context) = self.getFileReferenceAndContext(blob, commit)
                    internal_identifier = context.identifier + '-' + str(oid)

                    if force or not self.config.hasFeature(Feature.Persistence):
                        g = context
                    else:
                        g = RewriteGraph(
                            self.store.store.store,
                            internal_identifier,
                            context.identifier
                        )
                    default_graphs.append(g)
                except KeyError:
                    # Best effort: a blob that cannot be resolved is left
                    # out of the dataset.
                    pass

        instance = InMemoryAggregatedGraph(
            graphs=default_graphs, identifier='default')

        return VirtualGraph(instance), commitid

    def changeset(self, commit):
        """Write provenance and/or persistence data of `commit` to the store.

        Does nothing if neither the Provenance nor the Persistence feature is
        enabled. With Provenance, the commit is described as a PROV activity
        (author, committer, parents and the diff to its first parent). For
        every configured graph file an entity is described and, with
        Persistence, its triples are copied into a graph named after the
        blob's oid.

        Args:
            commit: the commit whose changeset is written
        """
        provenance = self.config.hasFeature(Feature.Provenance)
        persistence = self.config.hasFeature(Feature.Persistence)
        if not persistence and not provenance:
            return

        g = self.store.store

        if provenance:
            role_author_uri = QUIT['Author']
            role_committer_uri = QUIT['Committer']

            g.add((role_author_uri, is_a, PROV['Role']))
            g.add((role_committer_uri, is_a, PROV['Role']))

        # Create the commit
        i1, _ = self.instance(commit.id, True)

        commit_uri = QUIT['commit-' + commit.id]

        if provenance:
            g.add((commit_uri, is_a, PROV['Activity']))

            if 'Source' in commit.properties:
                g.add((commit_uri, is_a, QUIT['Import']))
                sources = commit.properties['Source'].strip()
                for source in re.findall("<.*?>", sources):
                    g.add((commit_uri, QUIT['dataSource'], URIRef(source.strip("<>"))))
            if 'Query' in commit.properties:
                g.add((commit_uri, is_a, QUIT['Transformation']))
                g.add((commit_uri, QUIT['query'], Literal(
                    commit.properties['Query'].strip())))

            g.add((commit_uri, QUIT['hex'], Literal(commit.id)))
            g.add((commit_uri, PROV['startedAtTime'], Literal(
                git_timestamp(commit.author.time, commit.author.offset),
                datatype=XSD.dateTime)))
            g.add((commit_uri, PROV['endedAtTime'], Literal(
                git_timestamp(commit.committer.time, commit.committer.offset),
                datatype=XSD.dateTime)))
            g.add((commit_uri, RDFS['label'], Literal(commit.message.strip())))

            # Author
            author_hash = pygit2.hash(commit.author.email).hex
            author_uri = QUIT['user-' + author_hash]
            g.add((commit_uri, PROV['wasAssociatedWith'], author_uri))
            g.add((author_uri, is_a, PROV['Agent']))
            g.add((author_uri, RDFS.label, Literal(commit.author.name)))
            g.add((author_uri, FOAF.mbox, Literal(commit.author.email)))

            q_author_uri = BNode()
            g.add((commit_uri, PROV['qualifiedAssociation'], q_author_uri))
            g.add((q_author_uri, is_a, PROV['Association']))
            g.add((q_author_uri, PROV['agent'], author_uri))
            # NOTE(review): the author role uses PROV['role'] while the
            # committer role uses PROV['hadRole']; kept as is because stored
            # data and queries may rely on it -- confirm which is intended.
            g.add((q_author_uri, PROV['role'], role_author_uri))

            if commit.author.name != commit.committer.name:
                # Committer
                committer_hash = pygit2.hash(commit.committer.email).hex
                committer_uri = QUIT['user-' + committer_hash]
                g.add((commit_uri, PROV['wasAssociatedWith'], committer_uri))
                g.add((committer_uri, is_a, PROV['Agent']))
                g.add((committer_uri, RDFS.label, Literal(commit.committer.name)))
                g.add((committer_uri, FOAF.mbox, Literal(commit.committer.email)))

                q_committer_uri = BNode()
                g.add(
                    (commit_uri, PROV['qualifiedAssociation'], q_committer_uri))
                g.add((q_committer_uri, is_a, PROV['Association']))
                # The committer's association has to name the committer as
                # agent (it used to point at the author).
                g.add((q_committer_uri, PROV['agent'], committer_uri))
                g.add((q_committer_uri, PROV['hadRole'], role_committer_uri))
            else:
                g.add((q_author_uri, PROV['hadRole'], role_committer_uri))

            # Parents
            for parent in (commit.parents or []):
                parent_uri = QUIT['commit-' + parent.id]
                g.add((commit_uri, QUIT["preceedingCommit"], parent_uri))
                g.add((commit_uri, PROV["wasInformedBy"], parent_uri))

            # Diff against the first parent (if there is one)
            parent = next(iter(commit.parents or []), None)
            i2 = self.instance(parent.id, True)[0] if parent else None

            delta = graphdiff(i2.store if i2 else None, i1.store)
            for index, (iri, changesets) in enumerate(delta.items()):
                update_uri = QUIT['update-{}-{}'.format(commit.id, index)]
                g.add((update_uri, QUIT['graph'], iri))
                g.add((commit_uri, QUIT['updates'], update_uri))
                for (op, triples) in changesets:
                    op_uri = QUIT[op + '-' + commit.id]
                    g.add((update_uri, QUIT[op], op_uri))
                    g.addN((s, p, o, op_uri) for s, p, o in triples)

        # Entities
        if commit.id not in self._graphconfigs:
            self.updateGraphConfig(commit.id)

        graph_config = self._graphconfigs.get(commit.id)
        configured_files = graph_config.getgraphurifilemap().values()

        for entity in commit.node().entries(recursive=True):
            # todo check if file was changed
            if not entity.is_file:
                continue
            if entity.name not in configured_files:
                continue

            graphUri = graph_config.getgraphuriforfile(entity.name)
            blob = (entity.name, entity.oid)

            try:
                _, context = self.getFileReferenceAndContext(blob, commit)
            except KeyError:
                # Fall back to parsing the entity's own content.
                graph = Graph(identifier=graphUri)
                graph.parse(data=entity.content, format='nt')

                self._blobs.set(
                    blob, (FileReference(entity.name, entity.content), graph)
                )
                # Bind the fallback graph, otherwise `context` is undefined
                # (or stale from a previous iteration) below.
                context = graph

            private_uri = QUIT["graph-{}".format(entity.oid)]

            if provenance or persistence:
                g.add((private_uri, is_a, PROV['Entity']))
                g.add(
                    (private_uri, PROV['specializationOf'], context.identifier))
                g.add(
                    (private_uri, PROV['wasGeneratedBy'], commit_uri))

                q_usage = BNode()
                g.add((private_uri, PROV['qualifiedGeneration'], q_usage))
                g.add((q_usage, is_a, PROV['Generation']))
                g.add((q_usage, PROV['activity'], commit_uri))

                prev = next(entity.history(), None)
                if prev:
                    # Same naming scheme as `private_uri`, so the link points
                    # at the entity written for the previous version. (This
                    # used to append the leaked loop variable of the diff
                    # loop, which is unbound without Provenance or with an
                    # empty diff.)
                    prev_uri = QUIT["graph-{}".format(prev.oid)]
                    g.add((private_uri, PROV['wasDerivedFrom'], prev_uri))
                    g.add((commit_uri, PROV['used'], prev_uri))

                    q_derivation = BNode()
                    g.add((private_uri, PROV['qualifiedDerivation'], q_derivation))
                    g.add((q_derivation, is_a, PROV['Derivation']))
                    g.add((q_derivation, PROV['entity'], prev_uri))
                    g.add((q_derivation, PROV['hadActivity'], commit_uri))

            if persistence:
                g.addN((s, p, o, private_uri) for s, p, o
                       in context.triples((None, None, None)))

    def getFilesForCommit(self, commit):
        """Get all (entry name, oid) tuples for a commit.

        On Cache miss this method also updates the commits cache.

        Args:
            commit: the commit to inspect, may be None

        Returns:
            A set of (name, oid) tuples of the files of the commit which are
            listed in the graph configuration; an empty set for None.
        """
        if commit is None:
            return set()

        if commit.id not in self._commits:
            if commit.id not in self._graphconfigs:
                self.updateGraphConfig(commit.id)

            uriFileMap = self._graphconfigs.get(commit.id).getgraphurifilemap()
            configured_files = uriFileMap.values()
            blobs = set()

            for entity in commit.node().entries(recursive=True):
                if entity.is_file and entity.name in configured_files:
                    blobs.add((entity.name, entity.oid))

            self._commits.set(commit.id, blobs)
            return blobs
        return self._commits.get(commit.id)

    def getFileReferenceAndContext(self, blob, commit):
        """Get the FileReference and Context for a given blob (name, oid) of a commit.

        On Cache miss this method also updates the blobs cache.

        Args:
            blob: a (name, oid) tuple
            commit: the commit the blob belongs to

        Returns:
            A tuple (FileReference, Graph).
        """
        if commit.id not in self._graphconfigs:
            self.updateGraphConfig(commit.id)

        if blob not in self._blobs:
            (name, _) = blob
            content = commit.node(path=name).content
            graphUri = self._graphconfigs.get(commit.id).getgraphuriforfile(name)
            graph = Graph(identifier=URIRef(graphUri))
            graph.parse(data=content, format='nt')
            quitWorkingData = (FileReference(name, content), graph)
            self._blobs.set(blob, quitWorkingData)
            return quitWorkingData
        return self._blobs.get(blob)

    def applyQueryOnCommit(self, parsedQuery, parent_commit_ref, target_ref, query=None,
                           default_graph=None, named_graph=None):
        """Apply an update query on the graph and the git repository.

        The changes that were applied before a possible failure are still
        committed; the exception is re-raised afterwards.

        Args:
            parsedQuery: the parsed update query
            parent_commit_ref: reference of the commit the query is applied on
            target_ref: a ref/branch where the commit will be applied to
            query: the query string, used for the commit message
            default_graph: using-graph-uri values from SPARQL protocol
            named_graph: using-named-graph-uri values from SPARQL protocol

        Returns:
            The id of the newly created commit (None if nothing was committed).
        """
        # Fresh lists per call instead of shared mutable defaults.
        default_graph = [] if default_graph is None else default_graph
        named_graph = [] if named_graph is None else named_graph

        graph, _ = self.instance(parent_commit_ref)
        resultingChanges, exception = graph.update(parsedQuery)
        # TODO need to revert or invalidate the graph at this point
        # if an exception occurred.
        oid = self.commit(graph, resultingChanges, 'New Commit from QuitStore', parent_commit_ref,
                          target_ref, query=query, default_graph=default_graph,
                          named_graph=named_graph)
        if exception:
            raise exception
        return oid

    def commit(self, graph, delta, message, parent_commit_ref, target_ref, query=None,
               default_graph=None, named_graph=None, **kwargs):
        """Commit changes after applying deltas to the blobs.

        This methods analyzes the delta and applies the changes to the blobs of the repository.
        A commit message is built with help of message and if called from endpoint with query,
        default_graph and named_graph.
        **kwargs can be used to extend the commit message with custom key-value-pairs.

        Args:
            graph: the current graph instance
            delta: delta that will be applied
            message: commit message
            parent_commit_ref: the commit-id of preceeding commit
            target_ref: a ref/branch were the commit will be applied to
            query: the query that lead to the commit
            default_graph: using-graph-uri values from SPARQL protocol
            named_graph: using-named-graph-uri values from SPARQL protocol

        Returns:
            The newly created commits id, or None if the delta is empty or
            no commit was created.
        """
        if self._isDeltaEmpty(delta):
            return None

        # Fresh lists per call instead of shared mutable defaults.
        default_graph = [] if default_graph is None else default_graph
        named_graph = [] if named_graph is None else named_graph

        parent_commit_id = None
        parent_commit = None
        blobs = []

        if parent_commit_ref:
            parent_commit = self.repository.revision(parent_commit_ref)
        if parent_commit:
            parent_commit_id = parent_commit.id
            try:
                blobs = self.getFilesForCommit(parent_commit)
            except KeyError:
                pass
        index = self.repository.index(parent_commit_id)

        if parent_commit_id not in self._graphconfigs:
            self.updateGraphConfig(parent_commit_id)

        graphconfig = self._graphconfigs.get(parent_commit_id)
        known_files = graphconfig.getfiles().keys()

        blobs_new = self._applyKnownGraphs(delta, blobs, parent_commit, index)
        new_contexts = self._applyUnknownGraphs(delta, known_files)
        new_config = copy(graphconfig)

        for identifier, fileReference in new_contexts.items():
            # Add new blobs to repo
            index.add(fileReference.path, fileReference.content)
            if graphconfig.mode == 'graphfiles':
                index.add(fileReference.path + '.graph', identifier + "\n")

            # Update config
            new_config.addgraph(identifier, fileReference.path, 'nt')

            # Update Cache and add new contexts to store
            blob = fileReference.path, index.stash[fileReference.path][0]
            self._blobs.set(blob, (fileReference, graph.store.get_context(identifier)))
            blobs_new.add(blob)

        if graphconfig.mode == 'configuration':
            index.add('config.ttl', new_config.graphconf.serialize(format='turtle').decode())

        message = self._build_message(message, query, delta, default_graph, named_graph, **kwargs)
        author = self.repository._repository.default_signature

        oid = index.commit(message, author.name, author.email, ref=target_ref)

        if self.config.hasFeature(Feature.GarbageCollection):
            self.garbagecollection()

        if not oid:
            # No commit was created, so there is no id to report.
            return None

        self._commits.set(oid.hex, blobs_new)
        self.syncSingle(self.repository.revision(oid.hex))
        return oid.hex

    def _build_message(self, message, query, result, default_graph, named_graph, **kwargs):
        """Assemble the commit message.

        The message consists of `message`, an empty line and one
        ``Key: value`` line for each of: the (escaped) query, the operation
        types and LOAD sources found in `result`, the protocol graph URIs
        and every additional keyword argument.
        """
        out = []
        if message:
            out.append(message)
            out.append('')
        if query:
            out.append('Query: "{}"'.format(query.replace("\\", "\\\\").replace("\"", "\\\"")))

        source = []
        operation_types = []
        for entry in (result or []):
            if "type" in entry:
                operation_types.append(entry["type"])
                if entry["type"] == "LOAD":
                    source.append("<{}>".format(entry["source"]))
        if operation_types:
            out.append('OperationTypes: "{}"'.format(",".join(operation_types)))
        if source:
            out.append('Source: "{}"'.format(",".join(source)))
        if isinstance(default_graph, list) and len(default_graph) > 0:
            out.append('using-graph-uri: {}'.format(', '.join(default_graph)))
        if isinstance(named_graph, list) and len(named_graph) > 0:
            out.append('using-named-graph-uri: {}'.format(', '.join(named_graph)))
        for k, v in kwargs.items():
            # str() so that non-string values do not fail on .replace().
            out.append('{}: "{}"'.format(k, str(v).replace('"', "\\\"")))
        return "\n".join(out)

    def _applyKnownGraphs(self, delta, blobs, parent_commit, index):
        """Apply the delta to the files which already exist in the parent.

        Changesets that were applied are removed from the delta entries, so
        only changes to unknown graphs remain afterwards.

        Returns:
            The set of (name, oid) blobs as they were added to `index`.
        """
        blobs_new = set()
        for blob in blobs:
            (fileName, _) = blob
            try:
                file_reference, context = self.getFileReferenceAndContext(blob, parent_commit)
                for entry in delta:
                    changeset = entry['delta'].get(context.identifier, None)

                    if changeset:
                        applyChangeset(file_reference, changeset, context.identifier)
                        del entry['delta'][context.identifier]

                index.add(file_reference.path, file_reference.content)

                # The content may have changed, so re-key the cache entry
                # with the oid of the staged file.
                self._blobs.remove(blob)
                blob = fileName, index.stash[file_reference.path][0]
                self._blobs.set(blob, (file_reference, context))
                blobs_new.add(blob)
            except KeyError:
                # Best effort: blobs that cannot be resolved are skipped.
                pass
        return blobs_new

    def _applyUnknownGraphs(self, delta, known_blobs):
        """Create file references for graphs which have no file so far.

        Blank node identifiers and the 'default' graph are skipped. If the
        file name derived from a graph IRI is already taken, a numbered name
        ``<name>_<n>.nt`` is used, with n one above the highest number found.

        Returns:
            A dict mapping graph identifiers to their new FileReference.
        """
        new_contexts = {}
        for entry in delta:
            for identifier, changeset in entry['delta'].items():
                if isinstance(identifier, BNode) or str(identifier) == 'default':
                    continue  # TODO default graph use case

                if identifier not in new_contexts:
                    base_name = iri_to_name(identifier)
                    fileName = base_name + '.nt'

                    if fileName in known_blobs:
                        # The dot is escaped so only a literal ".nt" matches.
                        reg = re.compile(re.escape(base_name) + r"_([0-9]+)\.nt")
                        matches = (reg.search(b) for b in known_blobs)
                        numbers = [int(m.group(1)) for m in matches if m] + [0]
                        fileName = '{}_{}.nt'.format(base_name, max(numbers) + 1)

                    new_contexts[identifier] = FileReference(fileName, '')

                fileReference = new_contexts[identifier]
                applyChangeset(fileReference, changeset, identifier)
        return new_contexts

    def _isDeltaEmpty(self, result):
        """Return True if no entry of `result` has a non-empty "delta"."""
        return not any("delta" in entry and entry["delta"] for entry in result)

    def garbagecollection(self):
        """Start garbage collection.

        Spawns ``git gc --auto --quiet`` in the repository directory unless a
        previously spawned process is still running. Failures to spawn are
        logged at debug level and not re-raised.
        """
        try:
            # Check if the garbage collection process is still running
            if self.gcProcess is None or self.gcProcess.poll() is not None:
                # Start garbage collection with "--auto" option,
                # which immediately terminates, if it is not necessary
                self.gcProcess = subprocess.Popen(
                    ["git", "gc", "--auto", "--quiet"],
                    cwd=self.repository.path
                )
                logger.debug('Spawn git garbage collection.')
        except Exception as e:
            logger.debug('Git garbage collection failed to spawn.')
            logger.debug(e)

    def updateGraphConfig(self, commitId):
        """Update the graph configuration for a given commit id."""
        graphconf = QuitGraphConfiguration(self.repository._repository)
        graphconf.initgraphconfig(commitId)
        self._graphconfigs.set(commitId, graphconf)