def processString(file, base=""):
    """
    Process C{file} as XML with a fixed set of options.

    The options are hard-wired: no warnings, white space preserved, no extra
    transformers, and both the C{xhtml} and C{lax} flags switched on.

    @param file: the input handed, untouched, to C{_processString}
    @keyword base: base value handed, untouched, to C{_processString}
    @return: whatever C{_processString} returns
    """
    fixed_options = Options(warnings=False,
                            space_preserve=True,
                            transformers=[],
                            xhtml=True,
                            lax=True)
    return _processString(file, "xml", fixed_options, base, False)
def parse(self, source, graph, pgraph=None, embedded_rdf=True,
          vocab_expansion=False, vocab_cache=False, rdfOutput=False):
    """
    Run the RDFa 1.1 processor and, when pyMicrodata can be imported, the
    Microdata processor over the same source, adding the triples to C{graph}.

    @param source: one of the input sources that the RDFLib package defined
    @type source: InputSource class instance
    @param graph: target graph for the triples; output graph, in RDFa spec. parlance
    @type graph: RDFLib Graph
    @keyword pgraph: target for error and warning triples; processor graph, in RDFa spec.
    parlance. If set to None, these triples are ignored
    @type pgraph: RDFLib Graph
    @keyword embedded_rdf: some formats allow embedding RDF in other formats: (X)HTML can
    contain turtle in a special <script> element, SVG can have RDF/XML embedded in a
    <metadata> element. This flag controls whether those triples should be interpreted and
    added to the output graph. Some languages (e.g., SVG) require this, and the flag is ignored.
    @type embedded_rdf: Boolean
    @keyword vocab_expansion: whether the RDFa @vocab attribute should also mean vocabulary
    expansion (see the RDFa 1.1 spec for further details)
    @type vocab_expansion: Boolean
    @keyword vocab_cache: in case vocab expansion is used, whether the expansion data (i.e.,
    vocabulary) should be cached locally. This requires the ability for the local application
    to write on the local file system
    @type vocab_cache: Boolean
    @keyword rdfOutput: whether Exceptions should be caught and added, as triples, to the
    processor graph, or whether they should be raised.
    @type rdfOutput: Boolean
    @raise TypeError: if C{source} is not a string, URL or file input source
    """
    if isinstance(source, StringInputSource):
        orig_source = source.getByteStream()
    elif isinstance(source, URLInputSource):
        orig_source = source.url
    elif isinstance(source, FileInputSource):
        # The processors are handed the file name, so the handle held by the
        # input source is closed here.
        orig_source = source.file.name
        source.file.close()
    else:
        # Fail with a clear message instead of an UnboundLocalError further down
        raise TypeError("Unsupported input source type: %s" % type(source).__name__)
    baseURI = source.getPublicId()

    # The RDFa part
    from pyRdfa import pyRdfa, Options
    self.options = Options(output_processor_graph=(pgraph is not None),
                           embedded_rdf=embedded_rdf,
                           vocab_expansion=vocab_expansion,
                           vocab_cache=vocab_cache)
    processor = pyRdfa(self.options, base=baseURI, media_type='text/html', rdfa_version='1.1')
    processor.graph_from_source(orig_source, graph=graph, pgraph=pgraph, rdfOutput=rdfOutput)

    # The Microdata part
    # NOTE(review): for a string source the very same byte stream object is handed
    # to both processors -- confirm that the second one still sees the data.
    try:
        from pyMicrodata import pyMicrodata
        processor = pyMicrodata(base=baseURI, vocab_expansion=vocab_expansion, vocab_cache=vocab_cache)
        processor.graph_from_source(orig_source, graph=graph, rdfOutput=rdfOutput)
    except ImportError:
        # no pyMicrodata installed: the Microdata triples are simply left out
        pass
def parse(self, source, graph, pgraph=None, media_type=None, rdfa_version=None,
          embedded_rdf=False, vocab_expansion=False, vocab_cache=False, rdfOutput=False):
    """
    Run the RDFa processor over the source, adding the triples to C{graph}.

    @param source: one of the input sources that the RDFLib package defined
    @type source: InputSource class instance
    @param graph: target graph for the triples; output graph, in RDFa spec. parlance
    @type graph: RDFLib Graph
    @keyword pgraph: target for error and warning triples; processor graph, in RDFa spec.
    parlance. If set to None, these triples are ignored
    @type pgraph: RDFLib Graph
    @keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of
    the RDFa source. None means the content type of the HTTP result is used, or a guess is
    made based on the suffix of a file
    @type media_type: string
    @keyword rdfa_version: 1.0 or 1.1. If the value is None, then, by default, 1.1 is used
    unless the source has explicit signals to use 1.0 (e.g., using a @version attribute, using
    a DTD set up for 1.0, etc)
    @type rdfa_version: string
    @keyword embedded_rdf: some formats allow embedding RDF in other formats: (X)HTML can
    contain turtle in a special <script> element, SVG can have RDF/XML embedded in a
    <metadata> element. This flag controls whether those triples should be interpreted and
    added to the output graph. Some languages (e.g., SVG) require this, and the flag is ignored.
    @type embedded_rdf: Boolean
    @keyword vocab_expansion: whether the RDFa @vocab attribute should also mean vocabulary
    expansion (see the RDFa 1.1 spec for further details)
    @type vocab_expansion: Boolean
    @keyword vocab_cache: in case vocab expansion is used, whether the expansion data (i.e.,
    vocabulary) should be cached locally. This requires the ability for the local application
    to write on the local file system
    @type vocab_cache: Boolean
    @keyword rdfOutput: whether Exceptions should be caught and added, as triples, to the
    processor graph, or whether they should be raised.
    @type rdfOutput: Boolean
    @raise TypeError: if C{source} is not a string, URL or file input source
    """
    from pyRdfa import pyRdfa, Options

    if isinstance(source, StringInputSource):
        orig_source = source.getByteStream()
    elif isinstance(source, URLInputSource):
        orig_source = source.url
    elif isinstance(source, FileInputSource):
        # The processor is handed the file name, so the handle held by the
        # input source is closed here.
        orig_source = source.file.name
        source.file.close()
    else:
        # Fail with a clear message instead of an UnboundLocalError further down
        raise TypeError("Unsupported input source type: %s" % type(source).__name__)

    self.options = Options(output_processor_graph=(pgraph is not None),
                           embedded_rdf=embedded_rdf,
                           vocab_expansion=vocab_expansion,
                           vocab_cache=vocab_cache)
    baseURI = source.getPublicId()
    processor = pyRdfa(self.options, base=baseURI, media_type=media_type, rdfa_version=rdfa_version)
    processor.graph_from_source(orig_source, graph=graph, pgraph=pgraph, rdfOutput=rdfOutput)
def extract_items(self, document, base_url=None, expanded=True):
    """
    Extract the RDFa content of a DOM document as parsed JSON-LD.

    :param document: DOM document handed to ``graph_from_DOM``
    :param base_url: base passed to the RDFa processor
    :param expanded: when true the JSON-LD is serialized without
        auto-compaction (``auto_compact=not expanded``)
    :return: the result of ``self._fix_order`` or, if that step fails, the
        plain ``json.loads`` of the serialized graph
    """
    options = Options(output_processor_graph=True,
                      embedded_rdf=False,
                      space_preserve=True,
                      vocab_expansion=False,
                      vocab_cache=False,
                      vocab_cache_report=False,
                      refresh_vocab_cache=False,
                      check_lite=False)
    g = PyRdfa(options, base=base_url).graph_from_DOM(document, graph=Graph(), pgraph=Graph())
    jsonld_string = g.serialize(format='json-ld', auto_compact=not expanded)
    # Depending on the rdflib version serialize() yields bytes or text;
    # only bytes need decoding.
    if isinstance(jsonld_string, bytes):
        jsonld_string = jsonld_string.decode('utf-8')
    try:
        # hack to fix the ordering of multi-value properties (see issue 116)
        # it should be disabled once PyRDFA fixes itself
        return self._fix_order(jsonld_string, document)
    except Exception:
        # Best effort: fall back to the unordered result, but let
        # KeyboardInterrupt / SystemExit propagate.
        return json.loads(jsonld_string)
def _parse_rdfa_to_graph(page: StringIO) -> rdflib.Graph:
    """Feed *page* to the RDFa processor with the host language set to HTML5.

    Returns whatever the processor's ``graph_from_source`` call yields.
    """
    html5_options = Options()
    html5_options.host_language = HostLanguage.html5
    return rdfa_processor(html5_options).graph_from_source(page)
def __init__(self, session, config, parent):
    """Initialise the base parser, then store RDFa options with warnings off.

    The stored options are created with ``warnings=False`` and have their
    ``warning_graph`` attribute reset to ``None``.
    """
    BaseParser.__init__(self, session, config, parent)
    quiet_options = Options(warnings=False)
    quiet_options.warning_graph = None
    self.options = quiet_options
def __init__(self, node, graph, inherited_state=None, base="", options=None):
    """
    Build the execution state (base, options, language, namespaces) for a node.

    @param node: the current DOM Node
    @param graph: the RDFLib Graph
    @keyword inherited_state: the state as inherited from upper layers. This
    inherited_state is mixed with the state information retrieved from the
    current node.
    @type inherited_state: L{State.ExecutionContext}
    @keyword base: string denoting the base URI for the specific node. This is
    overridden by a possible C{@xml:base}, but it overrides the possible base
    inherited from the upper layers. Note: C{@xml:base} is not officially part
    of the XHTML+RDFa syntax, but this could/should be handled by the DTD
    validation of the incoming document. The code itself is prepared for the
    C{@xml:base} usage, in accordance with the reference (in the RDFa syntax
    document) to other XML dialects that might use it.
    @keyword options: invocation option; only looked at when there is no
    inherited state
    @type options: L{Options<pyRdfa.Options>}
    """
    #-----------------------------------------------------------------
    # Settling the base
    # Note that, strictly speaking, it is not necessary to add the base to the
    # context, because there is only one place to set it (<base> element of the
    # <header>). It is done because it is prepared for a possible future change
    # in direction of accepting xml:base on each element.
    # At the moment, it is invoked with a 'None' at the top level of parsing,
    # that is when the <base> element is looked for.
    if inherited_state:
        self.base = inherited_state.base
        self.warning_URI_ref = inherited_state.warning_URI_ref
        self.options = inherited_state.options
    else:
        # this is the branch called from the very top
        self.base = ""
        # Every <base href="..."> is visited, so the last one in document
        # order is the one that sticks.
        for base_element in node.getElementsByTagName("base"):
            if base_element.hasAttribute("href"):
                self.base = base_element.getAttribute("href")
        if self.base == "":
            self.base = base
        if node.hasAttribute("xml:base"):
            self.base = node.getAttribute("xml:base")
        self.warning_URI_ref = URIRef(base)

        # this is just to play safe. I believe this branch should actually
        # not happen...
        if options is None:
            from pyRdfa import Options
            self.options = Options()
        else:
            self.options = options

        # Check the presence of the @profile and/or @version attribute for
        # the RDFa profile... (Not 100% sure that is necessary...)
        html = node.ownerDocument.documentElement
        if not (html.hasAttribute("version")
                and RDFa_VERSION == html.getAttribute("version")):
            # see if at least the profile has been set
            # Find the <head> element; every child is inspected, the last
            # one included.
            head = None
            for index in range(html.childNodes.length):
                child = html.childNodes.item(index)
                if child.nodeName == "head":
                    head = child
                    break
            profile_set = (head is not None
                           and head.hasAttribute("profile")
                           and RDFa_PROFILE in head.getAttribute("profile").strip().split())
            if not profile_set:
                self.add_warning(
                    "Neither an RDFa profile nor an RFDa version is set")

    #-----------------------------------------------------------------
    # Settling the language tags
    # check first the lang or xml:lang attribute
    # RDFa does not allow the lang attribute. XHTML5 relies :-( on @lang;
    # I just want to be prepared here...
    if node.hasAttribute("lang"):
        # an empty attribute value means "no language"
        self.lang = node.getAttribute("lang")
        if len(self.lang) == 0:
            self.lang = None
    elif node.hasAttribute("xml:lang"):
        self.lang = node.getAttribute("xml:lang")
        if len(self.lang) == 0:
            self.lang = None
    elif inherited_state:
        self.lang = inherited_state.lang
    else:
        self.lang = None

    #-----------------------------------------------------------------
    # Handling namespaces
    # First get the local xmlns declarations/namespaces stuff.
    local_ns = {}
    for i in range(node.attributes.length):
        attr = node.attributes.item(i)
        if attr.name.startswith('xmlns:'):
            # yep, there is a namespace setting
            key = attr.localName
            if key != "":  # exclude the top level xmlns setting...
                uri = attr.value
                # 'bind' it in the current graph to get a nicer output,
                # and remember the Namespace under its prefix
                graph.bind(key, uri)
                local_ns[key] = Namespace(uri)

    # See if anything has been collected at all.
    # If not, the namespaces of the incoming state are taken over as they
    # are (the very same mapping object, not a copy).
    if inherited_state and len(local_ns) == 0:
        self.ns = inherited_state.ns
    elif inherited_state:
        # copy the inherited mapping, then add the newly found namespaces,
        # possibly overwriting incoming values
        self.ns = {}
        for k in inherited_state.ns:
            self.ns[k] = inherited_state.ns[k]
        for k in local_ns:
            self.ns[k] = local_ns[k]
    else:
        self.ns = local_ns

    # see if the xhtml core vocabulary has been set
    self.xhtml_prefix = None
    for key in self.ns:
        if XHTML_URI == str(self.ns[key]):
            self.xhtml_prefix = key
            break
    if self.xhtml_prefix is None:
        if XHTML_PREFIX not in self.ns:
            self.ns[XHTML_PREFIX] = Namespace(XHTML_URI)
            self.xhtml_prefix = XHTML_PREFIX
        else:
            # the most disagreeable thing, the user has used
            # the prefix for something else...
            self.xhtml_prefix = XHTML_PREFIX + '_' + (
                "%d" % random.randint(1, 1000))
            self.ns[self.xhtml_prefix] = Namespace(XHTML_URI)
        graph.bind(self.xhtml_prefix, XHTML_URI)

    # extra tricks for unusual usages...
    # if the 'rdf' (or 'rdfs') prefix is not used, it is artificially added...
    if "rdf" not in self.ns:
        self.ns["rdf"] = ns_rdf
    if "rdfs" not in self.ns:
        self.ns["rdfs"] = ns_rdfs

    # Final touch: setting the default namespace...
    if node.hasAttribute("xmlns"):
        self.defaultNS = node.getAttribute("xmlns")
    elif inherited_state and inherited_state.defaultNS is not None:
        self.defaultNS = inherited_state.defaultNS
    else:
        self.defaultNS = None
def parse(self, doc):
    """Load ``doc.data`` into a new graph and pass it to ``process_graph``.

    The parse format is named explicitly only when ``doc.format`` is truthy.
    """
    graph = Graph()
    source = StringInputSource(doc.data)
    # Only pass a format keyword when the document declares one
    parse_kwargs = {'format': doc.format} if doc.format else {}
    graph.parse(source, **parse_kwargs)
    return self.process_graph(graph)


try:
    # Prefer the more featureful pyRdfa parser when it can be imported
    from pyRdfa import parseRDFa, Options

    rdfaOptions = Options(warnings=False)
    rdfaOptions.warning_graph = None

    class RdfAParser(RdfLibParser):
        """RDFa parser that delegates to pyRdfa's ``parseRDFa``."""

        def parse(self, doc):
            """Build a DOM from *doc* with minidom and process the RDFa graph."""
            dom_root = minidom.parse(doc)
            return self.process_graph(
                parseRDFa(dom_root, doc.uri, options=rdfaOptions))

except ImportError:
    # No pyRdfa lib: RdfAParser inherits everything from RdfLibParser
    class RdfAParser(RdfLibParser):
        pass