예제 #1
0
	def convert( self ) :
		"""
		Top level entry to convert and generate all the triples.

		Every item returned by C{self.get_top_level_items()} is handed to
		C{self.generate_triples} together with a fresh C{Evaluation_Context}.

		If C{self.vocabularies_used} is set, the graph is then post-processed with
		pyRdfa: the full C{process_rdfa_sem} run when C{self.vocab_expansion} is set,
		otherwise only the basic C{MiniOWL(self.graph).closure()}.

		The post-processing is best effort: if the pyRdfa package cannot be imported,
		or the processing raises an exception, the error is ignored.
		"""
		# The return values used to be collected for an RDF list entry point; that
		# list is not generated any more, so the calls are made only for their effect.
		for top_level_item in self.get_top_level_items() :
			self.generate_triples(top_level_item, Evaluation_Context())

		# If the vocab expansion is also switched on, this is the time to do it.
		# The basic expansion is always there; the follow-your-nose inclusion of
		# vocabulary is optional.
		if not self.vocabularies_used :
			return

		try :
			try :
				from ..pyRdfa.rdfs.process import MiniOWL, process_rdfa_sem
				from ..pyRdfa.options      import Options
			except Exception :
				# A failing relative import is not always an ImportError (older
				# interpreters raise ValueError or SystemError), hence the broad class.
				from pyRdfa.rdfs.process import MiniOWL, process_rdfa_sem
				from pyRdfa.options      import Options
			# If we did not get here, the pyRdfa package could not be
			# imported. Too bad, but life should go on in the except branch...
			if self.vocab_expansion :
				# This is the full deal
				options = Options(vocab_expansion = self.vocab_expansion, vocab_cache = self.vocab_cache)
				process_rdfa_sem(self.graph, options)
			else :
				MiniOWL(self.graph).closure()
		except Exception :
			# Deliberate best effort; unlike a bare except this lets
			# KeyboardInterrupt and SystemExit propagate.
			pass
예제 #2
0
def extrair_rdfa(url):
    """Extract the RDFa found at ``url`` and return it as a ``Graph``.

    The source is processed by pyRdfa with ``embedded_rdf`` switched on and
    serialized as ``pretty-xml``; that serialization is then parsed into a
    new ``Graph`` instance, which is returned.
    """
    processor = pyRdfa(options=Options(embedded_rdf=True))
    serialized = processor.rdf_from_source(url, outputFormat='pretty-xml')

    graph = Graph()
    # Graph.parse is fed a file-like object wrapping the serialized output.
    graph.parse(io.BytesIO(serialized))
    return graph
예제 #3
0
def check_term(conn, term, predicates):
    """Build the "obi" tree page for ``term`` and check its RDFa against a fixture.

    The HTML returned by ``gizmos.tree.build_tree`` is parsed into a DOM, the
    RDFa in it is loaded into a graph with pyRdfa's ``parse_one_node``, and
    that graph is handed to ``compare_graphs`` together with the graph parsed
    from the matching Turtle file under ``tests/resources``.

    :param conn: connection object; only its ``cursor()`` method is used here
    :param term: term identifier, also used to build the fixture file name
    :param predicates: passed on as ``predicate_ids``; when truthy the
        ``-predicates`` variant of the fixture is used
    """
    cursor = conn.cursor()
    markup = gizmos.tree.build_tree(cursor, "obi", term, predicate_ids=predicates)

    # Parse the generated HTML into a DOM and take its root element.
    dom_builder = html5lib.treebuilders.getTreeBuilder("dom")
    document = html5lib.HTMLParser(tree=dom_builder).parse(markup)
    root = document.documentElement

    # Initial pyRdfa state for the root element.
    actual = Graph()
    settings = {
        "output_default_graph": True,
        "output_processor_graph": True,
        "space_preserve": True,
        "transformers": [],
        "embedded_rdf": True,
        "vocab_expansion": False,
        "vocab_cache": True,
        "vocab_cache_report": False,
        "refresh_vocab_cache": False,
        "check_lite": False,
        "experimental_features": True,
    }
    state = ExecutionContext(
        root,
        actual,
        base="http://purl.obolibrary.org/obo/",
        options=Options(**settings),
        rdfa_version="1.1",
    )

    # Walk the DOM (recursively) and add the RDFa content to ``actual``.
    parse_one_node(root, actual, None, state, [])

    fixture = (f"tests/resources/obi-tree-{term}-predicates.ttl" if predicates
               else f"tests/resources/obi-tree-{term}.ttl")
    expected = Graph()
    expected.parse(fixture, format="turtle")

    compare_graphs(actual, expected)
예제 #4
0
    def parse(self):
        """
        Run the RDFa processor on C{self.uri} and keep the processor instance.

        C{self.default_graph} and C{self.processor_graph} are passed to the processor as the
        default and the processor graph, respectively. When C{self.rdfa_lite} is set, the
        C{lite_prune} transformer is added to the options. On return the processor instance
        is available as C{self.processor}.
        """
        if self.rdfa_lite:
            from pyRdfa.transform.lite import lite_prune
            extra_transformers = [lite_prune]
        else:
            extra_transformers = []

        settings = Options(
            output_default_graph=True,
            output_processor_graph=True,
            transformers=extra_transformers,
            vocab_expansion=self.vocab_expansion,
            embedded_rdf=self.embedded_rdf,
            add_informational_messages=True,
        )
        rdfa = pyRdfa(options=settings, base=self.base, media_type=self.media_type)
        rdfa.graph_from_source(
            self.uri,
            graph=self.default_graph,
            pgraph=self.processor_graph,
            rdfOutput=True,
        )
        # Kept so that parameters can be extracted later for the error messages
        self.processor = rdfa
예제 #5
0
                output_processor_graph = True
            elif a == "default":
                output_default_graph = True
                output_processor_graph = False
        else:
            usage()
            sys.exit(1)
except:
    usage()
    sys.exit(1)

# Bundle the settings collected earlier in the script into the option set
# handed to the processor.
options = Options(output_default_graph=output_default_graph,
                  output_processor_graph=output_processor_graph,
                  space_preserve=space_preserve,
                  transformers=extras,
                  embedded_rdf=embedded_rdf,
                  vocab_expansion=vocab_expansion,
                  vocab_cache=vocab_cache,
                  vocab_cache_report=vocab_cache_report,
                  refresh_vocab_cache=refresh_vocab_cache)

processor = pyRdfa(options, base)
# A parenthesised single argument prints the same under the Python 2 print
# statement and the Python 3 print function, so this form runs on both.
if len(value) >= 1:
    print(processor.rdf_from_sources(value,
                                     outputFormat=format,
                                     rdfOutput=rdfOutput))
else:
    # No entries in `value`: the document is read from standard input.
    print(processor.rdf_from_source(sys.stdin,
                                    outputFormat=format,
                                    rdfOutput=rdfOutput))
예제 #6
0
    def __init__(self,
                 node,
                 graph,
                 inherited_state=None,
                 base="",
                 options=None,
                 rdfa_version=None):
        """
        Set up the execution context (RDFa version, base, options, language, default namespace)
        for one DOM node.

        @param node: the current DOM Node
        @param graph: the RDFLib Graph
        @keyword inherited_state: the state as inherited
        from upper layers. This inherited_state is mixed with the state information
        retrieved from the current node. When it is missing (or false), the instance is
        initialised as the top level state.
        @type inherited_state: L{state.ExecutionContext}
        @keyword base: string denoting the base URI. It is only consulted for the top level state,
        and only when the document itself does not set a base (via a C{<base href>} element for
        the HTML family, or C{@xml:base} for the host languages listed in C{accept_xml_base}). The
        current XHTML+RDFa syntax does not allow the usage of C{@xml:base}, but SVG1.2 does, so
        handling it is necessary for SVG (and other possible XML dialects that accept C{@xml:base})
        @keyword options: invocation options, and references to warning graphs. Only consulted for
        the top level state; a default C{Options()} is created when it is C{None}.
        @type options: L{Options<pyRdfa.options>}
        @keyword rdfa_version: RDFa version for the top level state. When C{None}, the value of
        C{pyRdfa.rdfa_current_version} is used. A C{@version} attribute on the node mentioning
        RDFa 1.0 or RDFa 1.1 overrides either value.
        """
        def remove_frag_id(uri):
            """
            The fragment ID for self.base must be removed
            """
            try:
                # To be on the safe side:-)
                t = urlparse(uri)
                return urlunparse((t[0], t[1], t[2], t[3], t[4], ""))
            except Exception:
                # Fall back to the unmodified value; a bare except would also
                # swallow KeyboardInterrupt and SystemExit.
                return uri

        # This is, conceptually, an additional class initialization, but it must be done run time, otherwise import errors show up
        if len(ExecutionContext._resource_type) == 0:
            ExecutionContext._resource_type = {
                "href": ExecutionContext._URI,
                "src": ExecutionContext._URI,
                "vocab": ExecutionContext._URI,
                "about": ExecutionContext._CURIEorURI,
                "resource": ExecutionContext._CURIEorURI,
                "rel": ExecutionContext._TERMorCURIEorAbsURI,
                "rev": ExecutionContext._TERMorCURIEorAbsURI,
                "datatype": ExecutionContext._TERMorCURIEorAbsURI,
                "typeof": ExecutionContext._TERMorCURIEorAbsURI,
                "property": ExecutionContext._TERMorCURIEorAbsURI,
                "role": ExecutionContext._TERMorCURIEorAbsURI,
            }
        #-----------------------------------------------------------------
        self.node = node

        #-----------------------------------------------------------------
        # Settling the base. In a generic XML, xml:base should be accepted at all levels (though this is not the
        # case in, say, XHTML...)
        # At the moment, it is invoked with a 'None' at the top level of parsing, that is
        # when the <base> element is looked for (for the HTML cases, that is)
        if inherited_state:
            self.rdfa_version = inherited_state.rdfa_version
            self.base = inherited_state.base
            self.options = inherited_state.options

            # The list structure is shared with the parent state
            self.list_mapping = inherited_state.list_mapping
            self.new_list = False

            # for generic XML versions the xml:base attribute should be handled
            if self.options.host_language in accept_xml_base and node.hasAttribute(
                    "xml:base"):
                self.base = remove_frag_id(node.getAttribute("xml:base"))
        else:
            # this is the branch called from the very top
            self.list_mapping = ListStructure()
            self.new_list = True

            if rdfa_version is not None:
                self.rdfa_version = rdfa_version
            else:
                from pyRdfa import rdfa_current_version
                self.rdfa_version = rdfa_current_version

            # This value can be overwritten by a @version attribute
            if node.hasAttribute("version"):
                top_version = node.getAttribute("version")
                if "RDFa 1.0" in top_version or "RDFa1.0" in top_version:
                    self.rdfa_version = "1.0"
                elif "RDFa 1.1" in top_version or "RDFa1.1" in top_version:
                    self.rdfa_version = "1.1"

            # this is just to play safe. I believe this should actually not happen...
            if options is None:
                from pyRdfa import Options
                self.options = Options()
            else:
                self.options = options

            self.base = ""
            # handle the base element case for HTML
            if self.options.host_language in [
                    HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5
            ]:
                # The loop has no early exit: if several <base> elements carry
                # an @href, the last one in document order wins.
                for bases in node.getElementsByTagName("base"):
                    if bases.hasAttribute("href"):
                        self.base = remove_frag_id(bases.getAttribute("href"))
            elif self.options.host_language in accept_xml_base and node.hasAttribute(
                    "xml:base"):
                self.base = remove_frag_id(node.getAttribute("xml:base"))

            # If no local setting for base occurs, the input argument has it
            if self.base == "":
                self.base = base

            # Perform an extra beautification in RDFLib
            if self.options.host_language in beautifying_prefixes:
                prefixes = beautifying_prefixes[self.options.host_language]
                for key in prefixes:
                    graph.bind(key, prefixes[key])

            input_info = "Input Host Language:%s, RDFa version:%s, base:%s" % (
                self.options.host_language, self.rdfa_version, self.base)
            self.options.add_info(input_info)

        #-----------------------------------------------------------------
        # this will be used repeatedly, better store it once and for all...
        self.parsedBase = urlsplit(self.base)

        #-----------------------------------------------------------------
        # generate and store the local CURIE handling class instance
        self.term_or_curie = TermOrCurie(self, graph, inherited_state)

        #-----------------------------------------------------------------
        # Settling the language tags
        # @xml:lang has priority over @lang when both are present
        # it is a bit messy: the three fundamental modes (xhtml, html, or xml) are all slightly different:-(
        # first get the inherited state's language, if any
        if inherited_state:
            self.lang = inherited_state.lang
        else:
            self.lang = None

        self.supress_lang = False

        if self.options.host_language in [
                HostLanguage.xhtml, HostLanguage.xhtml5, HostLanguage.html5
        ]:
            # we may have lang and xml:lang
            if node.hasAttribute("lang"):
                lang = node.getAttribute("lang").lower()
            else:
                lang = None
            if node.hasAttribute("xml:lang"):
                xmllang = node.getAttribute("xml:lang").lower()
            else:
                xmllang = None
            # First of all, set the value, if any. An attribute that is present
            # but empty resets the language to None.
            if xmllang is not None:
                # this has priority
                self.lang = xmllang if len(xmllang) != 0 else None
            elif lang is not None:
                self.lang = lang if len(lang) != 0 else None
            # Ideally, a warning should be generated if lang and xmllang are both present with different values. But
            # the HTML5 Parser does its magic by overriding a lang value if xmllang is present, so the potential
            # error situations are simply swallowed...

        elif self.options.host_language in accept_xml_lang and node.hasAttribute(
                "xml:lang"):
            self.lang = node.getAttribute("xml:lang").lower()
            if len(self.lang) == 0:
                self.lang = None

        #-----------------------------------------------------------------
        # Set the default namespace. Used when generating XML Literals
        if node.hasAttribute("xmlns"):
            self.defaultNS = node.getAttribute("xmlns")
        elif inherited_state and inherited_state.defaultNS is not None:
            self.defaultNS = inherited_state.defaultNS
        else:
            self.defaultNS = None
예제 #7
0
파일: process.py 프로젝트: gromgull/pyrdfa3
def return_graph(uri, options, newCache=False):
    """Parse a file, and return an RDFLib Graph. The URI's content type is checked and either one of
    RDFLib's parsers is invoked (for the Turtle, RDF/XML, and N Triple cases) or a separate RDFa processing is invoked
    on the RDFa content.

    The Accept header of the HTTP request gives a preference to Turtle, followed by RDF/XML and then HTML (RDFa), in case content negotiation is used.

    This function is used to retrieve the vocabulary file and turn it into an RDFLib graph.

    @param uri: URI for the graph
    @param options: used as a place where warnings can be sent
    @param newCache: in case this is used with caching, whether a new cache is generated; that modifies the warning text
    @return: A tuple consisting of a graph and an expiration date. C{(None, None)} if the dereferencing
    was unsuccessful. If the media type is not recognised, or the RDFa processing fails, the graph
    is C{None}; in all cases a warning is added to C{options}.
    """
    def return_to_cache(msg):
        # msg is accepted for the caller's convenience but is not part of the warning text
        if newCache:
            options.add_warning(err_unreachable_vocab % uri,
                                warning_type=VocabReferenceError)
        else:
            options.add_warning(err_outdated_cache % uri,
                                warning_type=VocabReferenceError)

    retval = None

    try:
        content = URIOpener(
            uri, {
                'Accept':
                'text/html;q=0.8, application/xhtml+xml;q=0.8, text/turtle;q=1.0, application/rdf+xml;q=0.9'
            })
    except (HTTPError, RDFaError, Exception) as exc:
        # All failures are reported identically; the specific classes are
        # listed to keep exactly the set of exceptions caught before.
        return_to_cache(exc)
        return (None, None)

    # Store the expiration date of the newly accessed data
    expiration_date = content.expiration_date

    # NOTE(review): when one of the RDFLib parsers below fails, retval stays bound to the
    # Graph created just before the parse call rather than None — confirm callers expect that.
    if content.content_type == MediaTypes.turtle:
        try:
            retval = Graph()
            retval.parse(content.data, format="n3")
        except Exception as exc:
            options.add_warning(err_unparsable_Turtle_vocab % (uri, exc))
    elif content.content_type == MediaTypes.rdfxml:
        try:
            retval = Graph()
            retval.parse(content.data)
        except Exception as exc:
            # NOTE(review): this reuses the Turtle message for RDF/XML content;
            # looks like a copy-paste — confirm whether a dedicated message exists.
            options.add_warning(err_unparsable_Turtle_vocab % (uri, exc))
    elif content.content_type == MediaTypes.nt:
        try:
            retval = Graph()
            retval.parse(content.data, format="nt")
        except Exception as exc:
            options.add_warning(err_unparsable_ntriples_vocab % (uri, exc))
    elif content.content_type in [
            MediaTypes.xhtml, MediaTypes.html, MediaTypes.xml
    ] or xml_application_media_type.match(content.content_type) is not None:
        try:
            from pyRdfa import pyRdfa
            from pyRdfa.options import Options
            # The nested processor gets its own Options under a separate name, so
            # that the warning below still reaches the caller's `options` object.
            rdfa_options = Options()
            retval = pyRdfa(rdfa_options).graph_from_source(content.data)
        except Exception as exc:
            options.add_warning(err_unparsable_rdfa_vocab % (uri, exc))
    else:
        options.add_warning(err_unrecognised_vocab_type %
                            (uri, content.content_type))

    return (retval, expiration_date)
예제 #8
0
            elif a == "default":
                output_default_graph = True
                output_processor_graph = False
        else:
            usage()
            sys.exit(1)
except:
    usage()
    sys.exit(1)

# Bundle the settings collected earlier in the script into the option set
# handed to the processor; experimental features are always switched on here.
options = Options(output_default_graph=output_default_graph,
                  output_processor_graph=output_processor_graph,
                  space_preserve=space_preserve,
                  transformers=extras,
                  embedded_rdf=embedded_rdf,
                  vocab_expansion=vocab_expansion,
                  vocab_cache=vocab_cache,
                  vocab_cache_report=vocab_cache_report,
                  refresh_vocab_cache=refresh_vocab_cache,
                  check_lite=check_lite,
                  experimental_features=True)

processor = pyRdfa(options, base)
# A parenthesised single argument prints the same under the Python 2 print
# statement and the Python 3 print function, so this form runs on both.
if len(value) >= 1:
    print(processor.rdf_from_sources(value,
                                     outputFormat=format,
                                     rdfOutput=rdfOutput))
else:
    # No entries in `value`: the document is read from standard input.
    print(processor.rdf_from_source(sys.stdin,
                                    outputFormat=format,
                                    rdfOutput=rdfOutput))
예제 #9
0
                output_processor_graph = True
            elif a == "default":
                output_default_graph = True
                output_processor_graph = False
        else:
            usage()
            sys.exit(1)
except:
    usage()
    sys.exit(1)

# Option set for the processor, assembled from the settings collected earlier
# in the script.
options = Options(
    output_default_graph=output_default_graph,
    output_processor_graph=output_processor_graph,
    space_preserve=space_preserve,
    vocab_cache_report=vocab_cache_report,
    bypass_vocab_cache=bypass_vocab_cache,
    transformers=extras,
    vocab_expansion=vocab_expansion,
    vocab_cache=vocab_cache,
    hturtle=hturtle,
)

processor = pyRdfa(options, base)
if len(value) < 1:
    # No entries in `value`: the document is read from standard input.
    retval = processor.rdf_from_source(
        sys.stdin, outputFormat=format, rdfOutput=rdfOutput)
else:
    retval = processor.rdf_from_sources(
        value, outputFormat=format, rdfOutput=rdfOutput)
예제 #10
0
            (type, value, traceback) = sys.exc_info()
            options.add_warning(err_unparsable_Turtle_vocab % (uri, value))
    elif content.content_type == MediaTypes.nt:
        try:
            retval = Graph()
            retval.parse(content.data, format="nt")
        except:
            (type, value, traceback) = sys.exc_info()
            options.add_warning(err_unparsable_ntriples_vocab % (uri, value))
    elif content.content_type in [
            MediaTypes.xhtml, MediaTypes.html, MediaTypes.xml
    ] or xml_application_media_type.match(content.content_type) != None:
        try:
            from pyRdfa import pyRdfa
            from pyRdfa.options import Options
            options = Options()
            retval = pyRdfa(options).graph_from_source(content.data)
        except:
            (type, value, traceback) = sys.exc_info()
            options.add_warning(err_unparsable_rdfa_vocab % (uri, value))
    else:
        options.add_warning(err_unrecognised_vocab_type %
                            (uri, content.content_type))

    return (retval, expiration_date)


############################################################################################
# Module-level shorthands for the "type", "Property" and "Class" entries looked
# up in ns_rdf / ns_rdfs.
# NOTE(review): `type` shadows the Python builtin for all code that follows in
# this module — confirm nothing below relies on the builtin.
type = ns_rdf["type"]
Property = ns_rdf["Property"]
Class = ns_rdfs["Class"]