def _createProvenance(self, result):
    """Initialise the provenance record stored on ``result``.

    Builds a fresh RDF graph containing

    * the current user as ``prov:Agent`` / ``foaf:Person``,
    * the importer software as ``prov:Agent`` / ``prov:SoftwareAgent``,
    * one ``prov:Activity`` (``urn:bccvl:activity``) with a start time,
      associated with both agents,

    and stores the graph, serialised as turtle, on
    ``IProvenanceData(result).data``.

    :param result: object adaptable to ``IProvenanceData``.
    """
    provdata = IProvenanceData(result)
    from rdflib import URIRef, Literal, Namespace, Graph
    from rdflib.namespace import RDF, FOAF, XSD
    from rdflib.resource import Resource
    PROV = Namespace(u"http://www.w3.org/ns/prov#")
    LOCAL = Namespace(u"urn:bccvl:")
    graph = Graph()

    # the user is our agent
    member = api.user.get_current()
    username = member.getProperty('fullname') or member.getId()
    user = Resource(graph, LOCAL['user'])
    user.add(RDF['type'], PROV['Agent'])
    user.add(RDF['type'], FOAF['Person'])
    user.add(FOAF['name'], Literal(username))
    # Only record a mailbox when the member actually has an email;
    # otherwise we would store a meaningless 'mailto:None' / 'mailto:' URI.
    email = member.getProperty('email')
    if email:
        user.add(FOAF['mbox'], URIRef('mailto:{}'.format(email)))

    # add software as agent
    software = Resource(graph, LOCAL['software'])
    software.add(RDF['type'], PROV['Agent'])
    software.add(RDF['type'], PROV['SoftwareAgent'])
    software.add(FOAF['name'], Literal('BCCVL ALA Importer'))

    # script content is stored somewhere on result and will be exported
    # with zip?
    # ... or store along with pstats.json ? hidden from user
    # -> execenvironment after import -> log output?
    # -> source code ... maybe some link expression? stored on result ?
    #    separate entity?
    activity = Resource(graph, LOCAL['activity'])
    activity.add(RDF['type'], PROV['Activity'])
    # TODO: this is rather queued or created time for this activity ...
    #       could capture real start time on running status update
    #       (or start transfer)
    now = datetime.now().replace(microsecond=0)
    activity.add(PROV['startedAtTime'],
                 Literal(now.isoformat(), datatype=XSD['dateTime']))
    # NOTE(review): PROV-O names this relation prov:wasAssociatedWith;
    # 'hasAssociationWith' is kept unchanged because existing consumers
    # may query for it -- confirm before renaming.
    activity.add(PROV['hasAssociationWith'], user)
    activity.add(PROV['hasAssociationWith'], software)

    # this importer has no job parameters to add to the activity
    provdata.data = graph.serialize(format="turtle")
def __iter__(self):
    """Pass every pipeline item through, recording provenance for datasets.

    Items are yielded unchanged. For items whose ``_type`` is a dataset
    or remote dataset and whose path resolves to an object below
    ``self.context``, a ``prov:Entity`` describing that object is added
    to the stored provenance graph and linked to ``urn:bccvl:activity``
    via ``prov:wasGeneratedBy``. The activity's ``prov:endedAtTime`` is
    set the first time a dataset is processed.

    Items that are not datasets, have no path, or whose path cannot be
    traversed are yielded without touching any provenance data.
    """
    for item in self.previous:
        # check if we have a dataset
        if item['_type'] not in ('org.bccvl.content.dataset',
                                 'org.bccvl.content.remotedataset'):
            # not a dataset
            yield item
            continue

        pathkey = self.pathkey(*item.keys())[0]
        # no path .. can't do anything
        if not pathkey:
            yield item
            continue

        path = item[pathkey]
        # Skip the Plone site object itself
        if not path:
            yield item
            continue

        obj = self.context.unrestrictedTraverse(
            path.encode().lstrip('/'), None)
        # unrestrictedTraverse returns the supplied default (None) when
        # the path does not resolve; without an object there is nothing
        # to describe, so hand the item on untouched.
        if obj is None:
            yield item
            continue

        # FIXME: this is really not a great way to check where to find
        #        provenance data
        # check if we are inside an experiment (means we import result)
        if IExperiment.providedBy(self.context.__parent__):
            # result import
            context = self.context
        else:
            # dataset import?
            context = obj

        # TODO: do some sanity checks
        provdata = IProvenanceData(context)
        PROV = Namespace(u"http://www.w3.org/ns/prov#")
        BCCVL = Namespace(u"http://ns.bccvl.org.au/")
        LOCAL = Namespace(u"urn:bccvl:")
        graph = Graph()
        graph.parse(data=provdata.data or '', format='turtle')
        activity = Resource(graph, LOCAL['activity'])
        # FIXME: shouldn't I use uuid instead of id?
        entity = Resource(graph, LOCAL[obj.id])
        # create this dataset as new entity -> output of activity
        entity.add(RDF['type'], PROV['Entity'])
        # generated by
        entity.add(PROV['wasGeneratedBy'], activity)
        # PROV['prov:wasAttributedTo'] to user and software?
        # File metadata
        entity.add(DCTERMS['creator'], Literal(obj.Creator()))
        entity.add(DCTERMS['title'], Literal(obj.title))
        entity.add(DCTERMS['description'], Literal(obj.description))
        entity.add(DCTERMS['rights'], Literal(obj.rights))
        if obj.portal_type == 'org.bccvl.content.dataset':
            entity.add(DCTERMS['format'], Literal(obj.file.contentType))
        else:
            # FIXME: this doesn't seem to do the right thing
            entity.add(DCTERMS['format'], Literal(obj.format))
        # TODO: add metadata about file?
        #    genre, layers, emsc, gcm, year

        # set activities end time
        #   first one wins
        if activity.value(PROV['endedAtTime']) is None:
            ended = datetime.now().replace(microsecond=0)
            activity.add(
                PROV['endedAtTime'],
                Literal(ended.isoformat(), datatype=XSD['dateTime']))

        # TODO: extend activity metadata with execution environment data
        #       (logfile import?, pstats import) .. and script +
        #       params.json file
        # ALA import url
        pd = item.get('_ala_provenance', {})
        if pd:
            entity.add(BCCVL['download_url'], Literal(pd['url']))

        # store prov data
        provdata.data = graph.serialize(format="turtle")

        yield item
def __iter__(self):
    """Pass every pipeline item through, recording provenance for datasets.

    Items are yielded unchanged. For items whose ``_type`` is a dataset
    or remote dataset and whose path resolves to an object below
    ``self.context``, a ``prov:Entity`` describing that object is added
    to the stored provenance graph and linked to ``urn:bccvl:activity``
    via ``prov:wasGeneratedBy``. The activity's ``prov:endedAtTime`` is
    set the first time a dataset is processed.

    Items that are not datasets, have no path, or whose path cannot be
    traversed are yielded without touching any provenance data.
    """
    for item in self.previous:
        # check if we have a dataset
        if item['_type'] not in ('org.bccvl.content.dataset',
                                 'org.bccvl.content.remotedataset'):
            # not a dataset
            yield item
            continue

        pathkey = self.pathkey(*item.keys())[0]
        # no path .. can't do anything
        if not pathkey:
            yield item
            continue

        path = item[pathkey]
        # Skip the Plone site object itself
        if not path:
            yield item
            continue

        obj = self.context.unrestrictedTraverse(
            path.encode().lstrip('/'), None)
        # unrestrictedTraverse returns the supplied default (None) when
        # the path does not resolve; without an object there is nothing
        # to describe, so hand the item on untouched.
        if obj is None:
            yield item
            continue

        # FIXME: this is really not a great way to check where to find
        #        provenance data
        # check if we are inside an experiment (means we import result)
        if IExperiment.providedBy(self.context.__parent__):
            # result import
            context = self.context
        else:
            # dataset import?
            context = obj

        # TODO: do some sanity checks
        provdata = IProvenanceData(context)
        PROV = Namespace(u"http://www.w3.org/ns/prov#")
        BCCVL = Namespace(u"http://ns.bccvl.org.au/")
        LOCAL = Namespace(u"urn:bccvl:")
        graph = Graph()
        graph.parse(data=provdata.data or '', format='turtle')
        activity = Resource(graph, LOCAL['activity'])
        # FIXME: shouldn't I use uuid instead of id?
        entity = Resource(graph, LOCAL[obj.id])
        # create this dataset as new entity -> output of activity
        entity.add(RDF['type'], PROV['Entity'])
        # generated by
        entity.add(PROV['wasGeneratedBy'], activity)
        # PROV['prov:wasAttributedTo'] to user and software?
        # File metadata
        entity.add(DCTERMS['creator'], Literal(obj.Creator()))
        entity.add(DCTERMS['title'], Literal(obj.title))
        entity.add(DCTERMS['description'], Literal(obj.description))
        entity.add(DCTERMS['rights'], Literal(obj.rights))
        if obj.portal_type == 'org.bccvl.content.dataset':
            entity.add(DCTERMS['format'], Literal(obj.file.contentType))
        else:
            # FIXME: this doesn't seem to do the right thing
            entity.add(DCTERMS['format'], Literal(obj.format))
        # TODO: add metadata about file?
        #    genre, layers, emsc, gcm, year

        # set activities end time
        #   first one wins
        if activity.value(PROV['endedAtTime']) is None:
            ended = datetime.now().replace(microsecond=0)
            activity.add(
                PROV['endedAtTime'],
                Literal(ended.isoformat(), datatype=XSD['dateTime']))

        # TODO: extend activity metadata with execution environment data
        #       (logfile import?, pstats import) .. and script +
        #       params.json file
        # ALA import url
        pd = item.get('_ala_provenance', {})
        if pd:
            entity.add(BCCVL['download_url'], Literal(pd['url']))

        # store prov data
        provdata.data = graph.serialize(format="turtle")

        yield item
def _createProvenance(self, result):
    """Initialise the provenance record stored on ``result``.

    Builds a fresh RDF graph containing

    * the current user as ``prov:Agent`` / ``foaf:Person``,
    * the job script as ``prov:Agent`` / ``prov:SoftwareAgent``,
    * one ``prov:Activity`` (``urn:bccvl:activity``) with a start time,
      associated with both agents,
    * one ``bccvl:algoparam`` resource per entry in
      ``result.job_params``,
    * one ``prov:Entity`` for the ``data_table`` input dataset, linked
      to the activity via ``prov:used``,

    and stores the graph, serialised as turtle, on
    ``IProvenanceData(result).data``.

    :param result: object adaptable to ``IProvenanceData`` that carries
        a ``job_params`` mapping including a ``data_table`` key.
    """
    provdata = IProvenanceData(result)
    from rdflib import URIRef, Literal, Namespace, Graph
    from rdflib.namespace import RDF, FOAF, DCTERMS, XSD
    from rdflib.resource import Resource
    PROV = Namespace(u"http://www.w3.org/ns/prov#")
    BCCVL = Namespace(u"http://ns.bccvl.org.au/")
    LOCAL = Namespace(u"urn:bccvl:")
    graph = Graph()

    # the user is our agent
    member = api.user.get_current()
    username = member.getProperty('fullname') or member.getId()
    user = Resource(graph, LOCAL['user'])
    user.add(RDF['type'], PROV['Agent'])
    user.add(RDF['type'], FOAF['Person'])
    user.add(FOAF['name'], Literal(username))
    # Only record a mailbox when the member actually has an email;
    # otherwise we would store a meaningless 'mailto:None' / 'mailto:' URI.
    email = member.getProperty('email')
    if email:
        user.add(FOAF['mbox'], URIRef('mailto:{}'.format(email)))

    # add software as agent
    software = Resource(graph, LOCAL['software'])
    software.add(RDF['type'], PROV['Agent'])
    software.add(RDF['type'], PROV['SoftwareAgent'])
    software.add(FOAF['name'], Literal('BCCVL Job Script'))

    # script content is stored somewhere on result and will be exported
    # with zip?
    # ... or store along with pstats.json ? hidden from user
    # -> execenvironment after import -> log output?
    # -> source code ... maybe some link expression? stored on result ?
    #    separate entity?
    activity = Resource(graph, LOCAL['activity'])
    activity.add(RDF['type'], PROV['Activity'])
    # TODO: this is rather queued or created time for this activity ...
    #       could capture real start time on running status update
    #       (or start transfer)
    now = datetime.now().replace(microsecond=0)
    activity.add(PROV['startedAtTime'],
                 Literal(now.isoformat(), datatype=XSD['dateTime']))
    # NOTE(review): PROV-O names this relation prov:wasAssociatedWith;
    # 'hasAssociationWith' is kept unchanged because existing consumers
    # may query for it -- confirm before renaming.
    activity.add(PROV['hasAssociationWith'], user)
    activity.add(PROV['hasAssociationWith'], software)

    # add job parameters to activity
    for idx, (key, value) in enumerate(result.job_params.items()):
        param = Resource(graph, LOCAL[u'param_{}'.format(idx)])
        activity.add(BCCVL['algoparam'], param)
        param.add(BCCVL['name'], Literal(key))
        # We have only dataset references as parameters
        if key in ('data_table',):
            # The parameter value is the dataset identifier; point at the
            # same LOCAL[...] URI the input-dataset entity gets below.
            # (This previously referenced an undefined name ``dsuuid``.)
            param.add(BCCVL['value'], LOCAL[value])
        else:
            param.add(BCCVL['value'], Literal(value))

    # iterate over all input datasets and add them as entities
    for key in ('data_table',):
        dsbrain = uuidToCatalogBrain(result.job_params[key])
        if not dsbrain:
            continue
        ds = dsbrain.getObject()
        dsprov = Resource(graph, LOCAL[result.job_params[key]])
        dsprov.add(RDF['type'], PROV['Entity'])
        # dsprov.add(PROV['..'], Literal(''))
        dsprov.add(DCTERMS['creator'], Literal(ds.Creator()))
        dsprov.add(DCTERMS['title'], Literal(ds.title))
        dsprov.add(DCTERMS['description'], Literal(ds.description))
        dsprov.add(DCTERMS['rights'], Literal(ds.rights))  # ds.rightsstatement
        dsprov.add(DCTERMS['format'], Literal(ds.file.contentType))
        # location / source
        # graph.add(uri, DCTERMS['source'], Literal(''))
        # TODO: genre ...
        # TODO: resolution
        # species metadata
        md = IBCCVLMetadata(ds)
        # dsprov.add(BCCVL['scientificName'],
        #            Literal(md['species']['scientificName']))
        # dsprov.add(BCCVL['taxonID'], URIRef(md['species']['taxonID']))
        # ... species data, ... species id
        for layer in md.get('layers_used', ()):
            dsprov.add(BCCVL['layer'], LOCAL[layer])

        # link with activity
        activity.add(PROV['used'], dsprov)

    provdata.data = graph.serialize(format="turtle")
def _createProvenance(self, result):
    """Initialise the provenance record stored on ``result``.

    Builds a fresh RDF graph containing

    * the current user as ``prov:Agent`` / ``foaf:Person``,
    * the job script as ``prov:Agent`` / ``prov:SoftwareAgent``,
    * one ``prov:Activity`` (``urn:bccvl:activity``) with a start time,
      associated with both agents,
    * one ``bccvl:algoparam`` resource per entry in
      ``result.job_params``,
    * one ``prov:Entity`` for the ``data_table`` input dataset, linked
      to the activity via ``prov:used``,

    and stores the graph, serialised as turtle, on
    ``IProvenanceData(result).data``.

    :param result: object adaptable to ``IProvenanceData`` that carries
        a ``job_params`` mapping including a ``data_table`` key.
    """
    provdata = IProvenanceData(result)
    from rdflib import URIRef, Literal, Namespace, Graph
    from rdflib.namespace import RDF, FOAF, DCTERMS, XSD
    from rdflib.resource import Resource
    PROV = Namespace(u"http://www.w3.org/ns/prov#")
    BCCVL = Namespace(u"http://ns.bccvl.org.au/")
    LOCAL = Namespace(u"urn:bccvl:")
    graph = Graph()

    # the user is our agent
    member = api.user.get_current()
    username = member.getProperty('fullname') or member.getId()
    user = Resource(graph, LOCAL['user'])
    user.add(RDF['type'], PROV['Agent'])
    user.add(RDF['type'], FOAF['Person'])
    user.add(FOAF['name'], Literal(username))
    # Only record a mailbox when the member actually has an email;
    # otherwise we would store a meaningless 'mailto:None' / 'mailto:' URI.
    email = member.getProperty('email')
    if email:
        user.add(FOAF['mbox'], URIRef('mailto:{}'.format(email)))

    # add software as agent
    software = Resource(graph, LOCAL['software'])
    software.add(RDF['type'], PROV['Agent'])
    software.add(RDF['type'], PROV['SoftwareAgent'])
    software.add(FOAF['name'], Literal('BCCVL Job Script'))

    # script content is stored somewhere on result and will be exported
    # with zip?
    # ... or store along with pstats.json ? hidden from user
    # -> execenvironment after import -> log output?
    # -> source code ... maybe some link expression? stored on result ?
    #    separate entity?
    activity = Resource(graph, LOCAL['activity'])
    activity.add(RDF['type'], PROV['Activity'])
    # TODO: this is rather queued or created time for this activity ...
    #       could capture real start time on running status update
    #       (or start transfer)
    now = datetime.now().replace(microsecond=0)
    activity.add(PROV['startedAtTime'],
                 Literal(now.isoformat(), datatype=XSD['dateTime']))
    # NOTE(review): PROV-O names this relation prov:wasAssociatedWith;
    # 'hasAssociationWith' is kept unchanged because existing consumers
    # may query for it -- confirm before renaming.
    activity.add(PROV['hasAssociationWith'], user)
    activity.add(PROV['hasAssociationWith'], software)

    # add job parameters to activity
    for idx, (key, value) in enumerate(result.job_params.items()):
        param = Resource(graph, LOCAL[u'param_{}'.format(idx)])
        activity.add(BCCVL['algoparam'], param)
        param.add(BCCVL['name'], Literal(key))
        # We have only dataset references as parameters
        if key in ('data_table',):
            # The parameter value is the dataset identifier; point at the
            # same LOCAL[...] URI the input-dataset entity gets below.
            # (This previously referenced an undefined name ``dsuuid``.)
            param.add(BCCVL['value'], LOCAL[value])
        else:
            param.add(BCCVL['value'], Literal(value))

    # iterate over all input datasets and add them as entities
    for key in ('data_table',):
        dsbrain = uuidToCatalogBrain(result.job_params[key])
        if not dsbrain:
            continue
        ds = dsbrain.getObject()
        dsprov = Resource(graph, LOCAL[result.job_params[key]])
        dsprov.add(RDF['type'], PROV['Entity'])
        # dsprov.add(PROV['..'], Literal(''))
        dsprov.add(DCTERMS['creator'], Literal(ds.Creator()))
        dsprov.add(DCTERMS['title'], Literal(ds.title))
        dsprov.add(DCTERMS['description'], Literal(ds.description))
        dsprov.add(DCTERMS['rights'], Literal(ds.rights))  # ds.rightsstatement
        dsprov.add(DCTERMS['format'], Literal(ds.file.contentType))
        # location / source
        # graph.add(uri, DCTERMS['source'], Literal(''))
        # TODO: genre ...
        # TODO: resolution
        # species metadata
        md = IBCCVLMetadata(ds)
        # dsprov.add(BCCVL['scientificName'],
        #            Literal(md['species']['scientificName']))
        # dsprov.add(BCCVL['taxonID'], URIRef(md['species']['taxonID']))
        # ... species data, ... species id
        for layer in md.get('layers_used', ()):
            dsprov.add(BCCVL['layer'], LOCAL[layer])

        # link with activity
        activity.add(PROV['used'], dsprov)

    provdata.data = graph.serialize(format="turtle")