예제 #1
0
파일: pyRDFa.py 프로젝트: 15831944/virtuoso
def processString (file, base = "") :
    """
    Process C{file} through C{_processString} using a fixed set of options.

    The options are hard-wired: warnings off, space preservation on, no
    extra transformers, and both the C{xhtml} and C{lax} flags switched on.

    @param file: the input handed over, unchanged, to C{_processString}
    @keyword base: base value handed over, unchanged, to C{_processString}
    @return: whatever C{_processString} returns
    """
    options = Options(warnings=False,
                      space_preserve=True,
                      transformers=[],
                      xhtml=True,
                      lax=True)
    # NOTE(review): "xml" and the trailing False are passed positionally;
    # their meaning is defined by _processString, which is not visible here.
    return _processString(file, "xml", options, base, False)
예제 #2
0
파일: rdflib.py 프로젝트: gromgull/pyrdfa3
	def parse(self, source, graph,
			  pgraph                 = None,
			  embedded_rdf           = True,
			  vocab_expansion        = False,
			  vocab_cache            = False,
			  rdfOutput              = False) :
		"""
		Extract triples from C{source} into C{graph}: first with pyRdfa (invoked with
		media type C{text/html} and RDFa version C{1.1}), then, if the pyMicrodata
		package can be imported, with pyMicrodata as well.

		@param source: one of the input sources that the RDFLib package defined
		@type source: InputSource class instance
		@param graph: target graph for the triples; output graph, in RDFa spec. parlance
		@type graph: RDFLib Graph
		@keyword pgraph: target for error and warning triples; processor graph, in RDFa spec. parlance. If set to None, these triples are ignored
		@type pgraph: RDFLib Graph
		@keyword embedded_rdf: some formats allow embedding RDF in other formats: (X)HTML can contain turtle in a special <script> element, SVG can have RDF/XML embedded in a <metadata> element. This flag controls whether those triples should be interpreted and added to the output graph. Some languages (e.g., SVG) require this, and the flag is ignored.
		@type embedded_rdf: Boolean
		@keyword vocab_expansion: whether the RDFa @vocab attribute should also mean vocabulary expansion (see the RDFa 1.1 spec for further details)
		@type vocab_expansion: Boolean
		@keyword vocab_cache: in case vocab expansion is used, whether the expansion data (i.e., vocabulary) should be cached locally. This requires the ability for the local application to write on the local file system
		@type vocab_cache: Boolean
		@keyword rdfOutput: whether Exceptions should be caught and added, as triples, to the processor graph, or whether they should be raised.
		@type rdfOutput: Boolean
		@raise TypeError: if C{source} is not a string, URL or file input source
		"""
		# Turn the RDFLib input source into something the processors accept.
		if isinstance(source, StringInputSource) :
			orig_source = source.getByteStream()
		elif isinstance(source, URLInputSource) :
			orig_source = source.url
		elif isinstance(source, FileInputSource) :
			# Only the file name is passed on, so the handle RDFLib opened is released here.
			orig_source = source.file.name
			source.file.close()
		else :
			# Without this branch an unsupported source surfaced later as an
			# UnboundLocalError on orig_source.
			raise TypeError("Unsupported input source type: %s" % type(source).__name__)
		baseURI = source.getPublicId()

		# The RDFa part
		from pyRdfa import pyRdfa, Options
		self.options = Options(output_processor_graph = (pgraph is not None),
							   embedded_rdf           = embedded_rdf,
							   vocab_expansion        = vocab_expansion,
							   vocab_cache            = vocab_cache)

		processor = pyRdfa(self.options, base = baseURI, media_type = 'text/html', rdfa_version = '1.1')
		processor.graph_from_source(orig_source, graph=graph, pgraph=pgraph, rdfOutput = rdfOutput)

		# The Microdata part
		try:
			from pyMicrodata import pyMicrodata
			processor = pyMicrodata(base = baseURI, vocab_expansion = vocab_expansion, vocab_cache = vocab_cache)
			processor.graph_from_source(orig_source, graph=graph, rdfOutput = rdfOutput)
		except ImportError:
			# no pyMicrodata installed!
			pass
예제 #3
0
파일: rdflib.py 프로젝트: gromgull/pyrdfa3
	def parse(self, source, graph,
			  pgraph                 = None,
			  media_type             = None,
			  rdfa_version           = None,
			  embedded_rdf           = False,
			  vocab_expansion        = False,
			  vocab_cache            = False,
			  rdfOutput              = False) :
		"""
		Extract the RDFa triples of C{source} into C{graph} using pyRdfa.

		@param source: one of the input sources that the RDFLib package defined
		@type source: InputSource class instance
		@param graph: target graph for the triples; output graph, in RDFa spec. parlance
		@type graph: RDFLib Graph
		@keyword pgraph: target for error and warning triples; processor graph, in RDFa spec. parlance. If set to None, these triples are ignored
		@type pgraph: RDFLib Graph
		@keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of the RDFa source. None means the content type of the HTTP result is used, or a guess is made based on the suffix of a file
		@type media_type: string
		@keyword rdfa_version: 1.0 or 1.1. If the value is None, then, by default, 1.1 is used unless the source has explicit signals to use 1.0 (e.g., using a @version attribute, using a DTD set up for 1.0, etc)
		@type rdfa_version: string
		@keyword embedded_rdf: some formats allow embedding RDF in other formats: (X)HTML can contain turtle in a special <script> element, SVG can have RDF/XML embedded in a <metadata> element. This flag controls whether those triples should be interpreted and added to the output graph. Some languages (e.g., SVG) require this, and the flag is ignored.
		@type embedded_rdf: Boolean
		@keyword vocab_expansion: whether the RDFa @vocab attribute should also mean vocabulary expansion (see the RDFa 1.1 spec for further details)
		@type vocab_expansion: Boolean
		@keyword vocab_cache: in case vocab expansion is used, whether the expansion data (i.e., vocabulary) should be cached locally. This requires the ability for the local application to write on the local file system
		@type vocab_cache: Boolean
		@keyword rdfOutput: whether Exceptions should be caught and added, as triples, to the processor graph, or whether they should be raised.
		@type rdfOutput: Boolean
		@raise TypeError: if C{source} is not a string, URL or file input source
		"""
		from pyRdfa import pyRdfa, Options

		# Turn the RDFLib input source into something pyRdfa accepts.
		if isinstance(source, StringInputSource) :
			orig_source = source.getByteStream()
		elif isinstance(source, URLInputSource) :
			orig_source = source.url
		elif isinstance(source, FileInputSource) :
			# Only the file name is passed on, so the handle RDFLib opened is released here.
			orig_source = source.file.name
			source.file.close()
		else :
			# Without this branch an unsupported source surfaced later as an
			# UnboundLocalError on orig_source.
			raise TypeError("Unsupported input source type: %s" % type(source).__name__)

		self.options = Options(output_processor_graph = (pgraph is not None),
							   embedded_rdf           = embedded_rdf,
							   vocab_expansion        = vocab_expansion,
							   vocab_cache            = vocab_cache)

		baseURI   = source.getPublicId()
		processor = pyRdfa(self.options, base = baseURI, media_type = media_type, rdfa_version = rdfa_version)
		processor.graph_from_source(orig_source, graph=graph, pgraph=pgraph, rdfOutput = rdfOutput)
예제 #4
0
 def extract_items(self, document, base_url=None, expanded=True):
     """
     Extract the RDFa content of ``document`` and return it as parsed JSON-LD.

     :param document: DOM document handed to ``PyRdfa.graph_from_DOM``
     :param base_url: base URL passed to the RDFa processor
     :param expanded: when true the JSON-LD is serialized without
         auto-compaction (``auto_compact=not expanded``)
     :return: the result of ``self._fix_order`` or, if that step fails,
         the plain ``json.loads`` of the serialized graph
     """
     options = Options(output_processor_graph=True,
                       embedded_rdf=False,
                       space_preserve=True,
                       vocab_expansion=False,
                       vocab_cache=False,
                       vocab_cache_report=False,
                       refresh_vocab_cache=False,
                       check_lite=False)
     g = PyRdfa(options, base=base_url).graph_from_DOM(document, graph=Graph(), pgraph=Graph())
     jsonld_string = g.serialize(format='json-ld', auto_compact=not expanded)
     # Older RDFLib releases return bytes from serialize(), newer ones return
     # str; decode only when needed so both keep working.
     if isinstance(jsonld_string, bytes):
         jsonld_string = jsonld_string.decode('utf-8')

     try:
         # hack to fix the ordering of multi-value properties (see issue 116)
         # it should be disabled once PyRDFA fixes itself
         return self._fix_order(jsonld_string, document)
     except Exception:
         # Best-effort fallback: keep the unordered result, but no longer
         # swallow KeyboardInterrupt / SystemExit as the bare except did.
         return json.loads(jsonld_string)
예제 #5
0
def _parse_rdfa_to_graph(page: StringIO) -> rdflib.Graph:
    """Run the RDFa processor over *page* with the host language set to HTML5.

    Returns whatever ``graph_from_source`` produces for the page.
    """
    html5_options = Options()
    html5_options.host_language = HostLanguage.html5
    return rdfa_processor(html5_options).graph_from_source(page)
예제 #6
0
 def __init__(self, session, config, parent):
     """Initialise the base parser, then store the RDFa options on the instance.

     The options are created with warnings disabled and their
     ``warning_graph`` attribute set to ``None``.
     """
     BaseParser.__init__(self, session, config, parent)
     quiet_options = Options(warnings=False)
     # Configure the options object fully before publishing it on self.
     quiet_options.warning_graph = None
     self.options = quiet_options
예제 #7
0
    def __init__(self,
                 node,
                 graph,
                 inherited_state=None,
                 base="",
                 options=None):
        """
        Build the execution context for one DOM node: base URI, options,
        language, namespace prefixes and default namespace.

        @param node: the current DOM Node
        @param graph: the RDFLib Graph
        @keyword inherited_state: the state as inherited
        from upper layers. This inherited_state is mixed with the state information
        retrieved from the current node.
        @type inherited_state: L{State.ExecutionContext}
        @keyword base: string denoting the base URI for the specific node. This is
        overridden by a possible C{@xml:base}, but it overrides the possible
        base inherited from the upper layers. Note: C{@xml:base} is not officially part of the
        XHTML+RDFa syntax, but this could/should be handled by the DTD validation of the
        incoming document. The code itself is prepared for the C{@xml:base} usage, in
        accordance with the reference (in the RDFa syntax document) to other XML dialects that might use it.
        @keyword options: invocation option
        @type options: L{Options<pyRdfa.Options>}
        """
        #-----------------------------------------------------------------
        # settling the base
        # note that, strictly speaking, it is not necessary to add the base to the
        # context, because there is only one place to set it (<base> element of the <header>).
        # It is done because it is prepared for a possible future change in direction of
        # accepting xml:base on each element.
        # At the moment, it is invoked with a 'None' at the top level of parsing, that is
        # when the <base> element is looked for.
        if inherited_state:
            self.base = inherited_state.base
            self.warning_URI_ref = inherited_state.warning_URI_ref
            self.options = inherited_state.options
        else:
            # this is the branch called from the very top
            self.base = ""
            # If several <base> elements carry an href, the last one wins.
            for bases in node.getElementsByTagName("base"):
                if bases.hasAttribute("href"):
                    self.base = bases.getAttribute("href")
            if self.base == "":
                self.base = base
            if node.hasAttribute("xml:base"):
                self.base = node.getAttribute("xml:base")
            self.warning_URI_ref = URIRef(base)
            # this is just to play safe. I believe this branch should actually not happen...
            if options is None:
                from pyRdfa import Options
                self.options = Options()
            else:
                self.options = options

            # check the presence of the @profile and/or @version attribute for the RDFa profile...
            # (Not 100% sure that is necessary...)
            html = node.ownerDocument.documentElement
            if not (html.hasAttribute("version")
                    and RDFa_VERSION == html.getAttribute("version")):
                # see if at least the profile has been set

                # Find the <head> element. Every child is inspected; the
                # previous bound of ``length - 1`` skipped the last one.
                head = None
                children = html.childNodes
                for index in range(children.length):
                    child = children.item(index)
                    if child.nodeName == "head":
                        head = child
                        break

                profiles = []
                if head is not None and head.hasAttribute("profile"):
                    profiles = head.getAttribute("profile").strip().split()
                if RDFa_PROFILE not in profiles:
                    self.add_warning(
                        "Neither an RDFa profile nor an RFDa version is set")

        #-----------------------------------------------------------------
        # Settling the language tags
        # check first the lang or xml:lang attribute
        # RDFa does not allow the lang attribute. XHTML5 relies :-( on @lang;
        # I just want to be prepared here...
        # An empty attribute value is stored as None.
        if node.hasAttribute("lang"):
            self.lang = node.getAttribute("lang") or None
        elif node.hasAttribute("xml:lang"):
            self.lang = node.getAttribute("xml:lang") or None
        elif inherited_state:
            self.lang = inherited_state.lang
        else:
            self.lang = None

        #-----------------------------------------------------------------
        # Handling namespaces
        # First get the local xmlns declarations/namespaces stuff.
        local_ns = {}
        for i in range(node.attributes.length):
            attr = node.attributes.item(i)
            if attr.name.startswith('xmlns:'):
                # yep, there is a namespace setting
                key = attr.localName
                if key != "":
                    # exclude the top level xmlns setting...
                    uri = attr.value
                    # 'bind' the prefix in the current graph to get a nicer
                    # output, and remember the Namespace under that prefix
                    graph.bind(key, uri)
                    local_ns[key] = Namespace(uri)

        # See if anything has been collected at all.
        # If not, the namespaces of the incoming state is
        # taken over
        if not local_ns and inherited_state:
            # NOTE: this shares the inherited dictionary rather than copying it
            self.ns = inherited_state.ns
        elif inherited_state:
            # copy the inherited prefixes, then add the newly found
            # namespaces, possibly overwriting incoming values
            self.ns = {}
            for k in inherited_state.ns:
                self.ns[k] = inherited_state.ns[k]
            for k in local_ns:
                self.ns[k] = local_ns[k]
        else:
            self.ns = local_ns

        # see if the xhtml core vocabulary has been set
        self.xhtml_prefix = None
        for key in self.ns.keys():
            if XHTML_URI == str(self.ns[key]):
                self.xhtml_prefix = key
                break
        if self.xhtml_prefix is None:
            if XHTML_PREFIX not in self.ns:
                self.ns[XHTML_PREFIX] = Namespace(XHTML_URI)
                self.xhtml_prefix = XHTML_PREFIX
            else:
                # the most disagreeable thing, the user has used
                # the prefix for something else...
                self.xhtml_prefix = XHTML_PREFIX + '_' + (
                    "%d" % random.randint(1, 1000))
                self.ns[self.xhtml_prefix] = Namespace(XHTML_URI)
            graph.bind(self.xhtml_prefix, XHTML_URI)

        # extra tricks for unusual usages...
        # if the 'rdf' prefix is not used, it is artificially added...
        if "rdf" not in self.ns:
            self.ns["rdf"] = ns_rdf
        if "rdfs" not in self.ns:
            self.ns["rdfs"] = ns_rdfs

        # Final touch: setting the default namespace...
        if node.hasAttribute("xmlns"):
            self.defaultNS = node.getAttribute("xmlns")
        elif inherited_state and inherited_state.defaultNS is not None:
            self.defaultNS = inherited_state.defaultNS
        else:
            self.defaultNS = None
예제 #8
0
    def parse(self, doc):
        """Load ``doc.data`` into a new graph and hand it to ``process_graph``.

        ``doc.format`` is forwarded as the ``format`` keyword only when it
        is truthy; otherwise ``Graph.parse`` is called without it.
        """
        target = Graph()
        source = StringInputSource(doc.data)
        parse_kwargs = {"format": doc.format} if doc.format else {}
        target.parse(source, **parse_kwargs)
        return self.process_graph(target)


try:
    # Prefer the pyRdfa-based parser whenever the package can be imported.
    from pyRdfa import parseRDFa, Options
    # Module-level options shared by every RdfAParser: warnings are off
    # and no warning graph is attached.
    rdfaOptions = Options(warnings=False)
    rdfaOptions.warning_graph = None

    class RdfAParser(RdfLibParser):
        """RDFa parser that delegates the extraction to pyRdfa."""

        def parse(self, doc):
            """Build a DOM from ``doc``, extract its RDFa and process the graph."""
            # NOTE(review): minidom.parse() is given the document object
            # itself -- confirm it is a file name or file-like object.
            rdfa_graph = parseRDFa(minidom.parse(doc), doc.uri,
                                   options=rdfaOptions)
            return self.process_graph(rdfa_graph)

except ImportError:
    # pyRdfa is not available: RdfAParser is then RdfLibParser under
    # another name.

    class RdfAParser(RdfLibParser):
        """Fallback used without pyRdfa; inherits RdfLibParser unchanged."""


예제 #9
0
    
    def parse(self, doc):
        """Load ``doc.data`` into a new graph and hand it to ``process_graph``.

        ``doc.format`` is forwarded as the ``format`` keyword only when it
        is truthy; otherwise ``Graph.parse`` is called without it.
        """
        target = Graph()
        source = StringInputSource(doc.data)
        parse_kwargs = {"format": doc.format} if doc.format else {}
        target.parse(source, **parse_kwargs)
        return self.process_graph(target)


try:
    # Prefer the pyRdfa-based parser whenever the package can be imported.
    from pyRdfa import parseRDFa, Options
    # Module-level options shared by every RdfAParser: warnings are off
    # and no warning graph is attached.
    rdfaOptions = Options(warnings=False)
    rdfaOptions.warning_graph = None

    class RdfAParser(RdfLibParser):
        """RDFa parser that delegates the extraction to pyRdfa."""

        def parse(self, doc):
            """Build a DOM from ``doc``, extract its RDFa and process the graph."""
            # NOTE(review): minidom.parse() is given the document object
            # itself -- confirm it is a file name or file-like object.
            rdfa_graph = parseRDFa(minidom.parse(doc), doc.uri,
                                   options=rdfaOptions)
            return self.process_graph(rdfa_graph)

except ImportError:
    # pyRdfa is not available: RdfAParser is then RdfLibParser under
    # another name.

    class RdfAParser(RdfLibParser):
        """Fallback used without pyRdfa; inherits RdfLibParser unchanged."""


예제 #10
0
 def __init__(self, session, config, parent):
     """Initialise the base parser, then store the RDFa options on the instance.

     The options are created with warnings disabled and their
     ``warning_graph`` attribute set to ``None``.
     """
     BaseParser.__init__(self, session, config, parent)
     quiet_options = Options(warnings=False)
     # Configure the options object fully before publishing it on self.
     quiet_options.warning_graph = None
     self.options = quiet_options