예제 #1
0
    def __init__(self, url=None, xml=None):
        """Collect the OpenGraph metadata of a page into ``self.metadata``.

        The markup is taken from ``xml`` when given, otherwise it is
        downloaded from ``url``.  It is first parsed as XML; if that fails
        it is parsed with html5lib and pyRdfa is switched to its HTML5
        host language.

        :param url: address of the page.  It is passed to pyRdfa as the
            base and only triples whose subject equals it are kept.
        :param xml: markup of the page; fetched from ``url`` when empty.
        :raises ValueError: if neither ``url`` nor ``xml`` is supplied.
        """
        if not xml:
            if not url:
                raise ValueError('either url or xml must be provided')
            # Close the connection explicitly instead of leaking the handle.
            response = urllib2.urlopen(url)
            try:
                xml = response.read()
            finally:
                response.close()

        pyRdfa_options = pyRdfa.Options()

        try:
            dom = minidom.parse(StringIO(xml))
        except Exception:
            # Not parseable as XML: fall back to the HTML5 parser.
            # ``Exception`` (not a bare except) so that KeyboardInterrupt
            # and SystemExit are not swallowed by the fallback.
            parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
            dom = parser.parse(xml, encoding='utf-8')
            pyRdfa_options.host_language = pyRdfa.HTML5_RDFA

        # Workaround the problem that YouTube does not correctly set
        #   the xmlns:og attribute on <html> node; without this,
        #   pyRdfa does not find any OpenGraph tags

        for n in dom.childNodes:
            if n.nodeType == dom.ELEMENT_NODE and n.tagName == 'html':
                if not n.hasAttribute('xmlns:og'):
                    n.setAttributeNS('', 'xmlns:og', OPENGRAPH_NAMESPACES[0])

        self.metadata = {}

        for s, p, o in pyRdfa.parseRDFa(dom, url, options=pyRdfa_options):
            # Only keep statements made about the page itself.
            if s.encode('utf-8') != url:
                continue
            k = p.encode('utf-8')
            for ns in OPENGRAPH_NAMESPACES:
                if k.startswith(ns):
                    # Strip only the leading namespace; str.replace() would
                    # also remove any later occurrence inside the key.
                    # setdefault keeps the first value seen for a key.
                    self.metadata.setdefault(k[len(ns):], o.encode('utf-8'))
예제 #2
0
 def process_document(self, session, doc):
     """Parse the raw content of ``doc`` as RDFa and wrap it in a GraphRecord.

     The raw data is obtained with ``doc.get_raw(session)``, parsed into a
     DOM with minidom and handed to ``parseRDFa`` together with
     ``self.options``.
     """
     # Placeholder base URI carried over unchanged from the original code.
     base_uri = "not-sure-what-to-do-here"
     raw = doc.get_raw(session)
     dom = minidom.parse(StringIO.StringIO(raw))
     return GraphRecord(parseRDFa(dom, base_uri, options=self.options))
예제 #3
0
 def process_document(self, session, doc):
     """Parse the raw content of ``doc`` as RDFa and wrap it in a GraphRecord.

     The raw data is obtained with ``doc.get_raw(session)``, parsed into a
     DOM with minidom and handed to ``parseRDFa`` together with
     ``self.options``.
     """
     # Placeholder base URI carried over unchanged from the original code.
     base_uri = "not-sure-what-to-do-here"
     raw = doc.get_raw(session)
     dom = minidom.parse(StringIO.StringIO(raw))
     return GraphRecord(parseRDFa(dom, base_uri, options=self.options))
예제 #4
0
    def parse_string(self, in_string, base_uri, sink=None):
        """Extract the RDFa triples of ``in_string`` into a triple sink.

        The string is turned into a DOM (plus parser options) by
        ``self._make_dom``, parsed with ``pyRdfa.parseRDFa`` against
        ``base_uri`` and the resulting graph is copied into ``sink`` by
        ``self._graph_to_sink``.  A new ``DictTripleSink`` is created when
        no sink is passed.  The sink that was filled is returned.
        """
        document, parse_options = self._make_dom(in_string)
        triples = pyRdfa.parseRDFa(document, base_uri, options=parse_options)

        # Fall back to the default sink only after parsing, as before.
        target = DictTripleSink() if sink is None else sink

        self._graph_to_sink(triples, target)
        return target
예제 #5
0
파일: rdfa.py 프로젝트: cc-archive/rdfadict
    def parse_string(self, in_string, base_uri, sink=None):
        """Extract the RDFa triples of ``in_string`` into a triple sink.

        The string is turned into a DOM (plus parser options) by
        ``self._make_dom``, parsed with ``pyRdfa.parseRDFa`` against
        ``base_uri`` and the resulting graph is copied into ``sink`` by
        ``self._graph_to_sink``.  A new ``DictTripleSink`` is created when
        no sink is passed.  The sink that was filled is returned.
        """
        document, parse_options = self._make_dom(in_string)
        triples = pyRdfa.parseRDFa(document, base_uri, options=parse_options)

        # Fall back to the default sink only after parsing, as before.
        target = DictTripleSink() if sink is None else sink

        self._graph_to_sink(triples, target)
        return target
예제 #6
0
 def parse(self, doc):
     """Parse ``doc`` as RDFa and return ``self.process_graph`` of the result.

     ``doc`` is read by ``minidom.parse`` and its ``uri`` attribute is
     passed to ``parseRDFa`` together with the ``rdfaOptions`` in scope.
     """
     dom_tree = minidom.parse(doc)
     rdf_graph = parseRDFa(dom_tree, doc.uri, options=rdfaOptions)
     return self.process_graph(rdf_graph)
예제 #7
0
 def parse(self, doc):
     """Parse ``doc`` as RDFa and return ``self.process_graph`` of the result.

     ``doc`` is read by ``minidom.parse`` and its ``uri`` attribute is
     passed to ``parseRDFa`` together with the ``rdfaOptions`` in scope.
     """
     dom_tree = minidom.parse(doc)
     rdf_graph = parseRDFa(dom_tree, doc.uri, options=rdfaOptions)
     return self.process_graph(rdf_graph)
예제 #8
0
파일: pyRDFa.py 프로젝트: 15831944/virtuoso
def _processString(str, outputFormat, options, base, rdfOutput) :
    """Extract the RDFa content of a string and return the serialized graph.

    The string is parsed with minidom.  If that fails, the host language is
    not C{GENERIC_XML} and C{options.lax} is set, the input is re-parsed with
    html5lib and the host language is switched to C{HTML5_RDFA}.

    @param str: the source text to parse
    @param outputFormat: format name handed to C{graph.serialize};
        "pretty-xml", "turtle" and "n3" are redirected to the serializers
        registered from C{pyRdfa.serializers}
    @param options: options object; its C{host_language}, C{lax},
        C{warnings} and C{comment_graph} attributes are used here
    @param base: base value passed on to C{parseRDFa}
    @param rdfOutput: if true, a failure while serializing is returned as
        an exception graph instead of being raised
    @return: the serialized graph
    @raise RDFaError: if the input cannot be parsed, or if serialization
        fails and C{rdfOutput} is false
    """
    def __register_XML_serializer(formatstring) :
        """The default XML Serializer of RDFlib is buggy, mainly when handling lists.
        An L{own version<serializers.PrettyXMLSerializer>} is
        registered in RDFlib and used in the rest of the package.
        @param formatstring: the string to identify this serializer with.
        """
        from rdflib.plugin import register
        from rdflib.syntax import serializer, serializers
        register(formatstring, serializers.Serializer, "pyRdfa.serializers.PrettyXMLSerializer", "PrettyXMLSerializer")

    def __register_Turtle_serializer(formatstring) :
        """The default Turtle Serializers of RDFlib is buggy and not very nice as far as the output is concerned.
        An L{own version<serializers.TurtleSerializer>} is registered in RDFLib and used in the rest of the package.
        @param formatstring: the string to identify this serializer with.
        """
        from rdflib.plugin import register
        from rdflib.syntax import serializer, serializers
        register(formatstring, serializers.Serializer, "pyRdfa.serializers.TurtleSerializer", "TurtleSerializer")

    # Exchanging the stock serializers against the versions stored with this package
    if outputFormat == "pretty-xml" :
        outputFormat = "my-xml"
        __register_XML_serializer(outputFormat)
    elif outputFormat in ("turtle", "n3") :
        outputFormat = "my-turtle"
        __register_Turtle_serializer(outputFormat)

    graph = Graph()
    msg = ""
    stream = StringIO.StringIO(str)
    try :
        dom = xml.dom.minidom.parse(stream)
        # Try to second-guess the input type
        # This is _not_ really kosher, but the minidom is not really namespace aware...
        # In practice the goal is to have the system recognize svg content automatically
        # First see if there is a default namespace defined for the document:
        top = dom.documentElement
        if top.hasAttribute("xmlns") :
            key = (top.getAttribute("xmlns"), top.nodeName)
            if key in __switch :
                options.host_language = __switch[key]
    except Exception :
        # XML Parsing error in the input.  Only the exception value is
        # needed; fetching it this way avoids shadowing the builtin 'type'
        # and keeping a reference to the traceback.
        value = sys.exc_info()[1]
        if options.host_language == GENERIC_XML or options.lax == False :
            msg = 'Parsing error in input file: "%s"' % value
            raise RDFaError(msg)

        msg = 'XHTML Parsing error in input file: %s. Falling back on the HTML5 parser' % value
        if options.warnings :
            options.comment_graph.add_warning(msg)

        # Now try to see if an HTML5 parser is an alternative...
        try :
            import html5lib
        except ImportError :
            # no alternative to the XHTML error, because HTML5 parser not available...
            msg2 = 'XHTML Parsing error in input file. Though parsing is lax, HTML5 parser not available'
            raise RDFaError(msg2)

        from html5lib import treebuilders
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

        # The failed XML parse has already consumed (part of) the stream;
        # rewind so that the HTML5 parser sees the whole input again.
        stream.seek(0)
        try :
            dom = parser.parse(stream)
            # The host language has changed
            options.host_language = HTML5_RDFA
        except Exception :
            # Well, even the HTML5 parser could not do anything with this...
            value = sys.exc_info()[1]
            msg2 = 'Parsing error in input file as HTML5: "%s"' % value
            raise RDFaError(msg + '\n' + msg2)

    parseRDFa(dom, base, graph, options)

    # Got all the graphs, serialize them
    try :
        if options.comment_graph.graph is not None :
            # Add the content of the comment graph to the output
            graph.bind("dist", DIST_NS)
            for t in options.comment_graph.graph :
                graph.add(t)
        return graph.serialize(format=outputFormat)
    except Exception :
        value = sys.exc_info()[1]

        if rdfOutput :
            # NOTE(review): no local named 'input' exists in this function, so
            # this binds the builtin 'input' — looks like a leftover from a
            # file-based variant; confirm what the base should be here.
            if base == "" : base = input
            return create_exception_graph("%s" % value, base, outputFormat, http=False)
        else :
            # re-raise the exception and let the caller deal with it...
            raise RDFaError("%s" % value)