# Example no. 1
# Example no. 2
class Graph(object):
	"""
	Object that scans an RDF graph for schema definitions (aka 'ontologies').

	In [1]: import ontospy2
	INFO:rdflib:RDFLib Version: 4.2.0

	In [2]: g = ontospy2.Graph("npgcore_latest.ttl")
	Loaded 3478 triples
	Ontologies found: 1
	"""

	def __init__(self, source, text=False, endpoint=False, rdf_format=None):
		"""
		Load the graph in memory, then set up all necessary attributes.

		Args:
			source: what to load. Interpreted by ``__loadRDF`` as RDF text, a
				SPARQL endpoint address, a URI/path string or an open file,
				depending on the flags below.
			text: if true, ``source`` is parsed as a string of RDF data.
			endpoint: if true, ``source`` is used as a SPARQL endpoint address.
			rdf_format: RDF serialization name handed to rdflib; when omitted
				``__loadRDF`` picks one.
		"""
		super(Graph, self).__init__()

		self.rdfgraph = rdflib.Graph()
		self.graphuri = None
		self.queryHelper = None # instantiated after we have a graph
		# entities extracted from the graph
		self.ontologies = []
		self.classes = []
		self.namespaces = []
		self.properties = []
		self.annotationProperties = []
		self.objectProperties = []
		self.datatypeProperties = []
		# entities without parents (see __computeTopLayer)
		self.toplayer = []
		self.toplayerProperties = []
		# keep track of the rdf source
		self.IS_ENDPOINT = False
		self.IS_FILE = False
		self.IS_URL = False
		self.IS_TEXT = False
		# finally, load the triples
		self.__loadRDF(source, text, endpoint, rdf_format)
		# extract entities into the lists initialised above
		# NOTE(review): this call was missing from the copy under review. It is
		# restored because the usage example in the class header shows that
		# constructing a Graph prints "Ontologies found", a message only
		# _scan emits -- confirm.
		self._scan()

	def __repr__(self):
		return "<OntoSPy Graph (%d triples)>" % (len(self.rdfgraph))

	def __loadRDF(self, source, text, endpoint, rdf_format):
		"""
		Load triples from ``source`` into ``self.rdfgraph``, then set up the
		query helper.

		The kind of source is recorded in the IS_TEXT / IS_ENDPOINT / IS_URL /
		IS_FILE flags:

		* ``text`` true: ``source`` is parsed as RDF data; the format defaults
		  to "turtle".
		* ``endpoint`` true: ``self.rdfgraph`` is replaced by a ConjunctiveGraph
		  backed by ``SPARQLStore(source)``; nothing is parsed.
		* otherwise ``source`` must be a string (a leading "www." gets
		  "http://" prepended) or a ``file`` object; the format defaults to
		  whatever ``guess_fileformat`` returns.

		Args:
			source: RDF text, endpoint address, URI/path string or open file.
			text: treat ``source`` as RDF text.
			endpoint: treat ``source`` as a SPARQL endpoint.
			rdf_format: RDF serialization name, or None to use the defaults above.

		Raises:
			Exception: if ``source`` is neither a string nor a file object.
				Errors raised while parsing are re-raised after a debug message.
		"""
		# --- work out what we were given
		if text:
			self.IS_TEXT = True
			rdf_format = rdf_format or "turtle"
		elif endpoint:
			self.IS_ENDPOINT = True
			# replace graph with ConjunctiveGraph
			self.rdfgraph = rdflib.ConjunctiveGraph(store=SPARQLStore(source))
			self.graphuri = source	# default uri is www location
		elif isinstance(source, str):
			self.IS_URL = True
			if source.startswith("www."): #support for lazy people
				source = "http://%s" % str(source)
			self.graphuri = source	# default uri is www location
			rdf_format = rdf_format or guess_fileformat(source)
		elif isinstance(source, file):	# `file` is the Python 2 builtin file type
			self.IS_FILE = True
			self.graphuri = source.name # default uri is filename
			rdf_format = rdf_format or guess_fileformat(source.name)
		else:
			raise Exception("You passed an unknown object. Only URIs and files are accepted.")

		# --- finally, try loading
		# Dispatch on this call's arguments rather than on the IS_* attributes:
		# the attributes are never reset, so a later load of a different kind
		# (see _scan) would otherwise take the branch of an earlier one.
		try:
			if text:
				self.rdfgraph.parse(data=source, format=rdf_format)
				printDebug("----------\nLoaded %d triples from text" % len(self.rdfgraph))
			elif endpoint:
				printDebug("Accessing SPARQL Endpoint <%s>" % self.graphuri)
				printDebug("(note: support for sparql endpoints is still experimental)")
			else:
				self.rdfgraph.parse(source, format=rdf_format)
				printDebug("----------\nLoaded %d triples from <%s>" % (len(self.rdfgraph), self.graphuri))
			# set up the query helper too
			self.queryHelper = QueryHelper(self.rdfgraph)
		except Exception:
			printDebug("\nError Parsing Graph (assuming RDF serialization was *%s*)\n" % (rdf_format))
			# re-raise: without a loaded graph and a query helper the object is unusable
			raise

	def serialize(self, rdf_format="turtle"):
		""" Shortcut that outputs the graph """
		return self.rdfgraph.serialize(format=rdf_format)
	def sparql(self, stringa):
		""" wrapper around a sparql query """
		qres = self.rdfgraph.query(stringa)
		return list(qres)

	def __extractNamespaces(self):
		Extract graph namespaces.
		Namespaces are given in this format:

			In [01]: for x in graph.namespaces():
					....:			print x
			('xml', rdflib.URIRef('http://www.w3.org/XML/1998/namespace'))
			('', rdflib.URIRef('http://cohereweb.net/ontology/cohere.owl#'))
			(u'owl', rdflib.URIRef('http://www.w3.org/2002/07/owl#'))
			('rdfs', rdflib.URIRef('http://www.w3.org/2000/01/rdf-schema#'))
			('rdf', rdflib.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#'))
			(u'xsd', rdflib.URIRef('http://www.w3.org/2001/XMLSchema#'))

		We assume that a base namespace is implied by an empty prefix		

		exit = []

		if self.IS_ENDPOINT==True:
			return False

			if self.graphuri not in [y for x,y in self.rdfgraph.namespaces()]:
				# if not base namespace is set, try to simulate one 
				self.rdfgraph.bind("_file_", rdflib.Namespace(self.graphuri))
			self.namespaces = sorted(self.rdfgraph.namespaces())

	# ------------	
	# === main method === #	 
	# ------------
	def _scan(self, source=None, text=False, endpoint=False, rdf_format=None):
		"""
		Scan a source of RDF triples and build all the objects needed to deal
		with the ontology/ies pythonically.

			In [1]: g._scan("npgcore_latest.ttl")
			Ontologies found: 1

		Args:
			source: optional extra source; when given, its triples are loaded
				through ``__loadRDF`` before scanning.
			text, endpoint, rdf_format: passed to ``__loadRDF`` with ``source``.
		"""
		if source: # add triples dynamically
			self.__loadRDF(source, text, endpoint, rdf_format)

		printDebug("started scanning...\n----------")

		# NOTE(review): the extraction calls below were missing from the copy
		# under review. They are restored in dependency order: class/property
		# extraction reads self.namespaces and looks up ontologies, and the top
		# layer needs both hierarchies -- confirm.
		self.__extractNamespaces()

		self.__extractOntologies()
		printDebug("Ontologies found: %d" % len(self.ontologies))

		self.__extractClasses()
		printDebug("Classes found...: %d" % len(self.classes))

		self.__extractProperties()
		printDebug("Properties found: %d" % len(self.properties))
		printDebug("Annotation......: %d" % len(self.annotationProperties))
		printDebug("Datatype........: %d" % len(self.datatypeProperties))
		printDebug("Object..........: %d" % len(self.objectProperties))

		self.__computeTopLayer()


	def __extractOntologies(self, exclude_BNodes = False, return_string=False):
		"""
		Build the Ontology instances found in the graph and store them in
		``self.ontologies``.

		Candidates come from ``self.queryHelper.getOntology()``. A candidate
		that is not a blank node becomes ``Ontology(candidate)``. A blank-node
		candidate such as

			[ a owl:Ontology ;
				vann:preferredNamespacePrefix "bsym" ;
				vann:preferredNamespaceUri "http://bsym.bloomberg.com/sym/" ],

		is skipped when ``exclude_BNodes`` is true. Otherwise it is identified
		by its dc:identifier or, failing that, by its vann:preferredNamespaceUri
		(with vann:preferredNamespacePrefix as preferred prefix when present).
		Blank nodes carrying neither are dropped.

		Each resulting ontology then gets its ``triples`` from
		``self.queryHelper.entityTriples``.

		Args:
			exclude_BNodes: ignore ontologies declared on blank nodes.
			return_string: unused; kept so existing callers keep working.
		"""
		vannprop = rdflib.URIRef("http://purl.org/vocab/vann/preferredNamespaceUri")
		vannpref = rdflib.URIRef("http://purl.org/vocab/vann/preferredNamespacePrefix")

		out = []
		qres = self.queryHelper.getOntology()

		if qres:
			# NOTE: SPARQL returns a list of rdflib.query.ResultRow (~ tuples..)
			for candidate in qres:
				node = candidate[0]

				if not isBlankNode(node):
					out += [Ontology(node)]
					continue

				if exclude_BNodes:
					continue

				# blank node: look for something usable as an identifier
				checkDC_ID = list(self.rdfgraph.objects(node, rdflib.namespace.DC.identifier))
				if checkDC_ID:
					out += [Ontology(checkDC_ID[0])]
					continue

				checkDC_ID = list(self.rdfgraph.objects(node, vannprop))
				if checkDC_ID:
					checkDC_prefix = list(self.rdfgraph.objects(node, vannpref))
					if checkDC_prefix:
						out += [Ontology(checkDC_ID[0], prefPrefix=checkDC_prefix[0])]
					else:
						out += [Ontology(checkDC_ID[0])]
		# else: no owl:Ontologies found

		self.ontologies = out
		# add all annotations/triples
		for onto in self.ontologies:
			onto.triples = self.queryHelper.entityTriples(onto.uri)

	#  RDFS:class vs OWL:class cf. http://www.w3.org/TR/owl-ref/ section 3.1

	def __extractClasses(self):
		"""
		Rebuild ``self.classes`` from the graph and wire the classes together.

		2015-06-04: removed sparql 1.1 queries
		2015-05-25: optimized via sparql queries in order to remove BNodes
		2015-05-09: new attempt

		Note: queryHelper.getAllClasses() returns a list of tuples,
		(class, classRDFtype),
		so in some cases there are duplicates if a class is both RDFS.Class
		and OWL.Class. In this case we keep only OWL.Class as it is more
		informative.

		Every class then gets its triples, mini graph and query helper, is
		attached to the ontology named by rdfs:isDefinedBy (when that ontology
		is known) and is linked to its direct superclasses.
		"""
		self.classes = [] # @todo: keep adding?
		qres = self.queryHelper.getAllClasses()

		for candidate in qres:
			test_existing_cl = self.getClass(uri=candidate[0])
			if not test_existing_cl:
				# create it
				self.classes += [OntoClass(candidate[0], candidate[1], self.namespaces)]
			elif candidate[1] == rdflib.OWL.Class:
				# update it: prefer OWL.Class over RDFS.Class
				test_existing_cl.rdftype = rdflib.OWL.Class

		#add more data
		for aClass in self.classes:
			aClass.triples = self.queryHelper.entityTriples(aClass.uri)
			aClass._buildGraph() # force construction of mini graph
			aClass.queryHelper = self.queryHelper

			# attach to an ontology
			for uri in aClass.getValuesForProperty(rdflib.RDFS.isDefinedBy):
				onto = self.getOntology(str(uri))
				if onto:
					onto.classes += [aClass]
					aClass.ontology = onto

			# add direct Supers
			directSupers = self.queryHelper.getClassDirectSupers(aClass.uri)
			for x in directSupers:
				superclass = self.getClass(uri=x[0])
				if superclass:
					# NOTE(review): the statements recording the links were missing
					# from the copy under review. `_parents` / `_children` are
					# assumed to be the lists behind parents() / children() --
					# confirm against OntoClass.
					if superclass not in aClass.parents():
						aClass._parents.append(superclass)
					# add inverse relationships (= direct subs for superclass)
					if aClass not in superclass.children():
						superclass._children.append(aClass)

	def __extractProperties(self):
		"""
		Rebuild ``self.properties`` (and the per-type lists) from the graph and
		wire the properties together.

		2015-06-04: removed sparql 1.1 queries
		2015-06-03: analogous to get classes

		Properties are instantiated making sure duplicates are pruned
		but the most specific rdftype is kept,
		eg OWL:ObjectProperty over RDF:property.

		Every property is then filed under datatype / annotation / object
		properties according to its rdftype, gets its triples and mini graph,
		is attached to the ontology named by rdfs:isDefinedBy (when that
		ontology is known) and is linked to its direct superproperties.
		"""
		self.properties = [] # @todo: keep adding?
		self.annotationProperties = []
		self.objectProperties = []
		self.datatypeProperties = []

		qres = self.queryHelper.getAllProperties()
		for candidate in qres:
			test_existing_prop = self.getProperty(uri=candidate[0])
			if not test_existing_prop:
				# create it
				self.properties += [OntoProperty(candidate[0], candidate[1], self.namespaces)]
			elif candidate[1] and (test_existing_prop.rdftype == rdflib.RDF.Property):
				# update it: a plain rdf:Property is replaced by the more specific type
				test_existing_prop.rdftype = inferMainPropertyType(candidate[1])

		#add more data
		for aProp in self.properties:
			if aProp.rdftype == rdflib.OWL.DatatypeProperty:
				self.datatypeProperties += [aProp]
			elif aProp.rdftype == rdflib.OWL.AnnotationProperty:
				self.annotationProperties += [aProp]
			elif aProp.rdftype == rdflib.OWL.ObjectProperty:
				self.objectProperties += [aProp]

			aProp.triples = self.queryHelper.entityTriples(aProp.uri)
			aProp._buildGraph() # force construction of mini graph

			# attach to an ontology [2015-06-15: no property type distinction yet]
			for uri in aProp.getValuesForProperty(rdflib.RDFS.isDefinedBy):
				onto = self.getOntology(str(uri))
				if onto:
					onto.properties += [aProp]
					aProp.ontology = onto

			# NOTE(review): __buildDomainRanges(aProp) has no caller in the part
			# of the file under review; it may have been invoked from this loop
			# -- confirm before relying on aProp.domains / aProp.ranges.

			# add direct Supers
			directSupers = self.queryHelper.getPropDirectSupers(aProp.uri)
			for x in directSupers:
				superprop = self.getProperty(uri=x[0])
				if superprop:
					# NOTE(review): the statements recording the links were missing
					# from the copy under review. `_parents` / `_children` are
					# assumed to be the lists behind parents() / children() --
					# confirm against OntoProperty.
					if superprop not in aProp.parents():
						aProp._parents.append(superprop)
					# add inverse relationships (= direct subs for superprop)
					if aProp not in superprop.children():
						superprop._children.append(aProp)

	def getClass(self, id=None, uri=None, match=None):
		get the saved-class with given ID or via other methods...
		Note: it tries to guess what is being passed..
		In [1]: g.getClass(uri='http://www.w3.org/2000/01/rdf-schema#Resource')
		Out[1]: <Class *http://www.w3.org/2000/01/rdf-schema#Resource*>
		In [2]: g.getClass(10)
		Out[2]: <Class *http://purl.org/ontology/bibo/AcademicArticle*> 

		In [3]: g.getClass(match="person")
		[<Class *http://purl.org/ontology/bibo/PersonalCommunicationDocument*>,
		 <Class *http://purl.org/ontology/bibo/PersonalCommunication*>,
		 <Class *http://xmlns.com/foaf/0.1/Person*>]
		if not id and not uri and not match:
			return None
		if type(id) == type("string"):
			uri = id
			id = None
			if not uri.startswith("http://"):
				match = uri
				uri = None
		if match:
			if type(match) != type("string"):
				return []
			res = []
			for x in self.classes:
				if match.lower() in x.uri.lower():
					res += [x]
			return res
			for x in self.classes:
				if id and x.id == id:
					return x
				if uri and x.uri.lower() == uri.lower():
					return x
			return None

	def getProperty(self, id=None, uri=None, match=None):
		get the saved-class with given ID or via other methods...
		Note: analogous to getClass method		
		if not id and not uri and not match:
			return None
		if type(id) == type("string"):
			uri = id
			id = None
			if not uri.startswith("http://"):
				match = uri
				uri = None
		if match:
			if type(match) != type("string"):
				return []
			res = []
			for x in self.properties:
				if match.lower() in x.uri.lower():
					res += [x]
			return res
			for x in self.properties:
				if id and x.id == id:
					return x
				if uri and x.uri.lower() == uri.lower():
					return x
			return None

	def getOntology(self, id=None, uri=None, match=None):
		get the saved-ontology with given ID or via other methods...	
		if not id and not uri and not match:
			return None
		if type(id) == type("string"):
			uri = id
			id = None
			if not uri.startswith("http://"):
				match = uri
				uri = None
		if match:
			if type(match) != type("string"):
				return []
			res = []
			for x in self.ontologies:
				if match.lower() in x.uri.lower():
					res += [x]
			return res
			for x in self.ontologies:
				if id and x.id == id:
					return x
				if uri and x.uri.lower() == uri.lower():
					return x
			return None

	def __computeTopLayer(self):

		exit = []
		for c in self.classes:
			if not c.parents():
				exit += [c]
		self.toplayer = exit # sorted(exit, key=lambda x: x.id) # doesnt work

		# properties 
		exit = []
		for c in self.properties:
			if not c.parents():
				exit += [c]
		self.toplayerProperties = exit # sorted(exit, key=lambda x: x.id) # doesnt work

	def printClassTree(self, element = None, showids=True, labels=False):
		"""
		Print nicely into stdout the class tree of an ontology.

		Note: indentation is made so that ids up to 3 digits fit in, plus a space.

		Args:
			element: entity to print the tree of; when omitted, one tree is
				printed for every entry of ``self.toplayer``.
			showids, labels: passed through to ``printGenericTree``.
		"""
		if not element:	 # first time
			for x in self.toplayer:
				printGenericTree(x, 0, showids, labels)
		else:
			printGenericTree(element, 0, showids, labels)

	def printPropertyTree(self, element = None, level=0, showids=True, labels=False):
		"""
		Print nicely into stdout the property tree of an ontology.

		Note: indentation is made so that ids up to 3 digits fit in, plus a space.

		Args:
			element: entity to print the tree of; when omitted, one tree is
				printed for every entry of ``self.toplayerProperties``.
			level: accepted for backward compatibility; not forwarded, trees
				are always printed starting from level 0.
			showids, labels: passed through to ``printGenericTree``.
		"""
		if not element:	 # first time
			for x in self.toplayerProperties:
				# same argument order as the single-element call below and as
				# printClassTree; this call used to pass (level, showids) in
				# the (showids, labels) positions
				printGenericTree(x, 0, showids, labels)
		else:
			printGenericTree(element, 0, showids, labels)




	def __buildDomainRanges(self, aProp):
		"""
		Extract domain/range details and add them to the Python objects.

		Reads the rdfs:domain and rdfs:range objects from ``aProp.rdfgraph``.
		Blank nodes are ignored. For every other value, the matching class from
		``self.classes`` is appended to ``aProp.domains`` / ``aProp.ranges`` and
		``aProp`` is appended to that class's ``domain_of`` / ``range_of``.
		When no class matches, the raw value is appended instead.

		Args:
			aProp: the property object to complete.
		"""
		domains = aProp.rdfgraph.objects(None, rdflib.RDFS.domain)
		ranges = aProp.rdfgraph.objects(None, rdflib.RDFS.range)

		for x in domains:
			if isBlankNode(x):
				continue
			aClass = self.getClass(uri=str(x))
			if aClass:
				aProp.domains += [aClass]
				aClass.domain_of += [aProp]
			else:
				aProp.domains += [x]  # edge case: it's not an OntoClass instance?

		for x in ranges:
			if isBlankNode(x):
				continue
			aClass = self.getClass(uri=str(x))
			if aClass:
				aProp.ranges += [aClass]
				aClass.range_of += [aProp]
			else:
				aProp.ranges += [x]  # same edge case as for domains