Пример #1
0
"""
import json
import logging
# Set the 'rdflib' logger threshold to ERROR here, ahead of the rdflib imports
# that follow. NOTE(review): presumably this is done early so that anything
# rdflib logs while it is being imported is already filtered -- confirm.
rdflib_logger = logging.getLogger('rdflib')
rdflib_logger.setLevel(logging.ERROR)

from lxml.html import fromstring
from rdflib import Graph, logger as rdflib_logger
from rdflib.plugins.parsers.pyRdfa import pyRdfa as PyRdfa, Options, logger as pyrdfa_logger
from rdflib.plugins.parsers.pyRdfa.initialcontext import initial_context

from extruct.xmldom import XmlDomHTMLParser

# Keep rdflib and PyRdfa quiet: only records at ERROR level or above pass.
pyrdfa_logger.setLevel(logging.ERROR)
rdflib_logger.setLevel(logging.ERROR)

# Add "twitter" and "fb" entries to the `ns` attribute of the RDFa 1.1
# entry in pyRdfa's initial context.
initial_context["http://www.w3.org/2011/rdfa-context/rdfa-1.1"].ns.update(
    dict(
        twitter="https://dev.twitter.com/cards#",
        fb="http://ogp.me/ns/fb#",
    )
)


class RDFaExtractor(object):
    def extract(self,
                htmlstring,
                base_url=None,
                encoding="UTF-8",
                expanded=True):
Пример #2
0
Based on pyrdfa3 and rdflib
"""
import json
import logging
from xml.dom import Node
from xml.dom.minidom import Attr, NamedNodeMap

from lxml.etree import ElementBase, _ElementStringResult, _ElementUnicodeResult, XPath
from lxml.html import fromstring, HTMLParser, HtmlElementClassLookup
from rdflib import Graph, logger as rdflib_logger
from rdflib.plugins.parsers.pyRdfa import pyRdfa as PyRdfa, Options, logger as pyrdfa_logger
from rdflib.plugins.parsers.pyRdfa.initialcontext import initial_context

# Raise the rdflib and PyRdfa logger thresholds to ERROR so that their
# INFO-level output is dropped.
pyrdfa_logger.setLevel(logging.ERROR)
rdflib_logger.setLevel(logging.ERROR)

# Extend the `ns` attribute of the RDFa 1.1 initial-context entry with the
# "twitter" and "fb" prefixes.
initial_context["http://www.w3.org/2011/rdfa-context/rdfa-1.1"].ns.update(
    dict(twitter="https://dev.twitter.com/cards#", fb="http://ogp.me/ns/fb#")
)


class DomElementUnicodeResult(object):
    """Hold a piece of text and tag it as a DOM text node.

    The class re-exposes three node-type constants from ``xml.dom.Node``,
    and every instance sets ``nodeType`` to ``Node.TEXT_NODE``.

    NOTE(review): the name suggests this stands in for lxml's
    ``_ElementUnicodeResult`` where a DOM-style node is expected -- confirm
    against the callers.
    """
    # Node-type constants copied from xml.dom.Node so they can be read
    # directly off this class or its instances.
    CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
    ELEMENT_NODE = Node.ELEMENT_NODE
    TEXT_NODE = Node.TEXT_NODE

    def __init__(self, text):
        """Store *text* and mark the instance as a text node."""
        self.text = text
        self.nodeType = Node.TEXT_NODE