Exemplo n.º 1
0
def parse_xml(xml_str_or_file, parser=None, **parser_kwargs):
    """Parse XML given as text, raw bytes or a file-like object.

    When no parser is supplied, an ``ET.XMLParser`` is built from
    ``parser_kwargs`` and configured to create ``ElementBaseXpathStr``
    elements, so xpathbuilder objects can be used directly without str
    casting.

    Args:
        xml_str_or_file (str, bytes or file-like): the XML text, the raw XML
            bytes, or any object exposing a ``read`` attribute.
        parser: parser used as-is when truthy; otherwise a default one is
            built.
        **parser_kwargs: forwarded to ``ET.XMLParser``; only used when the
            default parser is built.

    Returns:
        The root element of the parsed document.

    Raises:
        NotImplementedError: if the input is neither str, bytes nor an object
            with a ``read`` attribute.
    """
    if not parser:
        element_lookup = ET.ElementDefaultClassLookup(
            element=ElementBaseXpathStr)
        parser = ET.XMLParser(**parser_kwargs)
        parser.set_element_class_lookup(element_lookup)

    # Text goes straight through ``fromstring``, which already returns a root.
    if isinstance(xml_str_or_file, str):
        return ET.fromstring(xml_str_or_file, parser=parser)

    # Everything else is parsed as a stream; bytes are wrapped first.
    if isinstance(xml_str_or_file, bytes):
        stream = io.BytesIO(xml_str_or_file)
    elif hasattr(xml_str_or_file, 'read'):
        stream = xml_str_or_file
    else:
        raise NotImplementedError(
            'We only know how to parse string, bytes or file objects.  Use straight lxml methods'
        )
    return ET.parse(stream, parser=parser).getroot()
Exemplo n.º 2
0
    def fromfile(self,
                 source,
                 tags_factory=Tag,
                 fragment=False,
                 no_leading_text=False,
                 encoding='utf-8',
                 **kw):
        """Parse a XML file

        In:
          - ``source`` -- can be a filename, a ``http://``, ``https://`` or
            ``ftp://`` URL, or a file object; it is always closed before
            returning, even when reading or parsing fails
          - ``tags_factory`` -- class of the elements generated by the parser
          - ``fragment`` -- if ``True``, can parse a XML fragment i.e a XML without
            a unique root
          - ``no_leading_text`` -- if ``fragment`` is ``True`` and the XML to
            parse begins with a text, this text is kept unless
            ``no_leading_text`` is ``True``
          - ``encoding`` -- encoding given to the parser, used to open a
            filename and to encode the leading text of a fragment
          - ``kw`` -- keywords parameters are passed to the XML parser

        Return:
          - the root element of the parsed XML, if ``fragment`` is ``False``
          - a tuple of XML elements (preceded by the encoded leading text,
            when kept), if ``fragment`` is ``True``
        """
        if isinstance(source, (str, type(u''))):
            if source.startswith(('http://', 'https://', 'ftp://')):
                source = urlopen(source)
            else:
                source = fileopen(source, encoding=encoding)

        # Create a dedicated parser with the ``kw`` parameter
        parser = self._parser.__class__(encoding=encoding, **kw)
        # This parser will generate nodes of type ``tags_factory``
        parser.set_element_class_lookup(
            etree.ElementDefaultClassLookup(element=tags_factory))

        if not fragment:
            # Parse a tree (only one root)
            # ----------------------------

            # ``finally`` guarantees the source is released on a parse error
            try:
                root = etree.parse(source, parser).getroot()
            finally:
                source.close()

            # Attach the renderer to the root
            root._renderer = self
            return root

        # Parse a fragment (multiple roots)
        # ---------------------------------

        # Create a dummy root; the source is closed even if reading fails
        try:
            xml = BufferIO(b'<html><body>%s</body></html>' % source.read())
        finally:
            source.close()

        # ``[0]`` selects the dummy ``<body>`` wrapping the fragment
        root = etree.parse(xml, parser).getroot()[0]
        for e in root:
            if isinstance(e, tags_factory):
                # Attach the renderer to each roots
                e._renderer = self

        # Return the children of the dummy root
        return ((root.text.encode(encoding), )
                if root.text and not no_leading_text else ()) + tuple(root[:])
Exemplo n.º 3
0
    def parse_html(self,
                   source,
                   fragment=False,
                   no_leading_text=False,
                   xhtml=False,
                   **kw):
        """Parse a (X)HTML file

        In:
          - ``source`` -- can be a filename or a file object
          - ``fragment`` -- if ``True``, can parse a HTML fragment i.e a HTML without
            a unique root
          - ``no_leading_text`` -- if ``fragment`` is ``True`` and the HTML to
            parse begins with a text, this text is kept unless
            ``no_leading_text`` is ``True``
          - ``xhtml`` -- is the HTML to parse a valid XHTML ? If so, a XML
            parser is used instead of a HTML parser
          - ``kw`` -- keywords parameters are passed to the parser and
            forwarded to ``_parse_html``

        Return:
          - the root element of the parsed HTML, if ``fragment`` is ``False``
          - a list of HTML elements, if ``fragment`` is ``True``
        """
        parser = ET.XMLParser(**kw) if xhtml else ET.HTMLParser(**kw)
        # ``set_element_class_lookup`` replaces the deprecated
        # ``setElementClassLookup`` alias
        parser.set_element_class_lookup(
            ET.ElementDefaultClassLookup(element=_HTMLTag))

        return self._parse_html(parser, source, fragment, no_leading_text,
                                **kw)
Exemplo n.º 4
0
class Renderer(xml.XmlRenderer):
    """XML renderer specialised for RML documents."""

    doctype = '<!DOCTYPE document SYSTEM "rml.dtd">'
    content_type = 'application/pdf'
    namespace = 'http://namespaces.zope.org/rml'

    # Class-wide parser that generates ``Tag`` elements
    _parser = etree.XMLParser()
    _parser.set_element_class_lookup(etree.ElementDefaultClassLookup(element=Tag))

    def __init__(self, parent=None, *args, **kw):
        """Initialise the renderer and make the RML namespace the default one."""
        super(Renderer, self).__init__(parent, *args, **kw)

        self.namespaces = {None: self.namespace}

    @classmethod
    def get_tags(cls, tags, tag, signature):
        """Recursively collect the fields of ``tag`` and of its directives.

        ``tags`` is filled in place (tag name -> set of field names of its
        signature) and returned. A tag already present is not visited again.
        """
        if tag in tags:
            return tags

        tags[tag] = set(schema.getFields(signature))
        for directive in signature.queryTaggedValue('directives', ()):
            cls.get_tags(tags, directive.tag, directive.signature)

        return tags

    @classmethod
    def create_RML_tags(cls):
        """Add one ``xml.TagProp`` attribute per tag reachable from ``document``."""
        rml_tags = cls.get_tags({}, 'document', document.IDocument)
        for tag, signature in rml_tags.items():
            setattr(cls, tag, xml.TagProp(tag, signature))

    @classmethod
    def create_para_extra_tags(cls):
        """Add one ``xml.TagProp`` attribute per ``compile_*`` entry of ``Para``."""
        prefix = 'compile_'
        # Materialise the names first, then strip the prefix from each of them
        extra_tags = [
            name[len(prefix):] for name in Para.__dict__
            if name.startswith(prefix)
        ]

        for tag in extra_tags:
            setattr(cls, tag, xml.TagProp(tag))
Exemplo n.º 5
0
Arquivo: api.py Projeto: DolphDev/sans
    async def __aiter__(self) -> _AsyncGenerator[NSElement, None]:
        """Stream the compressed XML found at ``self.value``, element by element.

        Only elements whose tag is the upper-cased ``self.name`` with its
        trailing ``S`` characters stripped are reported by the pull parser.
        Each yielded element is cleared once the consumer resumes iteration,
        and its already-processed preceding siblings are dropped from the
        parent.

        Raises:
            RuntimeError: if ``Api.agent`` is not set.
        """
        if not Api.agent:
            raise RuntimeError("The API's user agent is not yet set.")

        url = self.value
        # pylint: disable=E1101
        tag = self.name.upper().rstrip("S")

        parser = etree.XMLPullParser(
            ["end"], base_url=url, remove_blank_text=True, tag=tag)
        element_lookup = etree.ElementDefaultClassLookup(element=NSElement)
        parser.set_element_class_lookup(element_lookup)
        events = parser.read_events()
        # wbits of 16 + MAX_WBITS makes zlib expect a gzip container
        inflater = zlib.decompressobj(16 + zlib.MAX_WBITS)

        request = Api.session.request(
            "GET", url, headers={"User-Agent": Api.agent})
        async with request as response:
            async for chunk, _ in response.content.iter_chunks():
                parser.feed(inflater.decompress(chunk))
                for _, element in events:
                    yield element
                    element.clear()
                    # Drop the leading children of the parent for as long as
                    # the element still has a preceding sibling
                    while True:
                        parent = element.getparent()
                        if parent is None or element.getprevious() is None:
                            break
                        del parent[0]
Exemplo n.º 6
0
 def createDefaultParser(self):
     """Build a XML parser from ``self.parser_config``.

     When ``self.element_class`` is not ``None``, the parser is set up to
     create elements of that class by default.
     """
     parser = _etree.XMLParser(**self.parser_config)
     default_class = self.element_class
     if self.element_class is None:
         return parser

     parser.set_element_class_lookup(
         _etree.ElementDefaultClassLookup(element=default_class))
     return parser
Exemplo n.º 7
0
    def __getattr__(cls, name: str) -> str:
        """
        Magic method giving access to a factory for a HTML element.

        NOTE(review): the ``-> str`` annotation does not match the code, which
        returns a callable; confirm and fix the annotation.

        Args:
            name: The HTML element name

        Returns:
            A callable forwarding its arguments to an ``ElementMaker`` bound
            to a parser that creates ``HtmlElement`` nodes

        Raises:
            AttributeError: If an HTML element name is invalid
        """
        # ``__elements__`` itself is rejected before it is looked up on ``cls``
        is_known = name != '__elements__' and name in cls.__elements__
        if not is_known:
            raise AttributeError('Invalid element')

        parser = etree.HTMLParser()
        element_lookup = etree.ElementDefaultClassLookup(element=HtmlElement)
        parser.set_element_class_lookup(element_lookup)

        def _wrapper(*args, **kwargs):
            maker = builder.ElementMaker(makeelement=parser.makeelement)
            factory = getattr(maker, name)
            return factory(*args, **kwargs)

        return _wrapper
Exemplo n.º 8
0
    def createFromExisting(cls, source):
        """
        Create a TouchOSCLayout instance from an existing TouchOSC Layout.

        @type source: filename or fileobject
        @param source: Path to an existing .touchosc file, or
            TouchOSC index.xml file (from unzipping .touchosc file).
            A file object is parsed directly as XML; NOTE(review): a file
            object wrapping a zipped .touchosc archive is not detected --
            confirm whether that case must be supported.
        @rtype: Layout
        @return: An instance containing the layout. Its name is the file
            name without extension for a .touchosc file, None otherwise.
        @raise ValueError: If the file name ends with neither ".touchosc"
            nor ".xml".
        """
        # Class lookup chain: the 'type' attribute first, then the
        # 'tabpage' tag, then the default lxml element class
        fallback = etree.ElementDefaultClassLookup()
        lookupTabpages = etree.ElementNamespaceClassLookup(fallback)
        namespace = lookupTabpages.get_namespace(None)
        namespace['tabpage'] = tabpage.Tabpage
        lookupControls = etree.AttributeBasedElementClassLookup(
            'type', controls.type_class_mapping, lookupTabpages)
        layoutParser = etree.XMLParser(remove_blank_text=True)
        # ``set_element_class_lookup`` replaces the deprecated
        # ``setElementClassLookup`` alias
        layoutParser.set_element_class_lookup(lookupControls)

        if not isinstance(source, (str, type(u''))):
            # File object: previously left ``layoutTree`` unbound
            return Layout(etree.parse(source, layoutParser), None)

        (name, extension) = os.path.splitext(os.path.basename(source))
        if extension == ".touchosc":
            # Stream index.xml straight out of the archive; both handles are
            # released even when parsing fails
            with ZipFile(source, "r") as archive:
                with archive.open("index.xml") as index:
                    layoutTree = etree.parse(index, layoutParser)
        elif extension == ".xml":
            name = None
            layoutTree = etree.parse(source, layoutParser)
        else:
            raise ValueError(
                "Unsupported layout file %r: expected a .touchosc or .xml "
                "file" % (source,))
        return Layout(layoutTree, name)
Exemplo n.º 9
0
def parseXML(filename):
    """Parse an XML document (hence also XHTML) into ``HtHtmlElement`` nodes.

    Returns the parsed tree as produced by ``_etree.parse``.
    """
    # Unlike HTML, XML has no pre-existing custom element classes, so a
    # single default class lookup is enough.
    element_lookup = _etree.ElementDefaultClassLookup(element=HtHtmlElement)
    xml_parser = _etree.XMLParser()
    xml_parser.set_element_class_lookup(element_lookup)
    tree = _etree.parse(filename, parser=xml_parser)
    return tree
Exemplo n.º 10
0
def Element(name, attrib={}, **extra):
    """Create a standalone ``MyWriter`` element.

    ``attrib`` is copied before being merged with ``extra``, so neither the
    caller's mapping nor the shared default is ever modified.
    """
    attributes = attrib.copy()
    attributes.update(extra)

    # A throwaway parser is only needed for its element factory
    element_lookup = etree.ElementDefaultClassLookup(element=MyWriter)
    factory_parser = etree.XMLParser()
    factory_parser.set_element_class_lookup(element_lookup)
    return factory_parser.makeelement(name, attrib=attributes)
Exemplo n.º 11
0
    def class_init(cls, special_tags):
        """Class initialisation

        In:
          -- ``special_tags`` -- tags that have a special factory (not used
             by this implementation)
        """
        # Create a XML parser that generate ``_Tag`` nodes
        cls._xml_parser = ET.XMLParser()
        # ``set_element_class_lookup`` replaces the deprecated
        # ``setElementClassLookup`` alias
        cls._xml_parser.set_element_class_lookup(
            ET.ElementDefaultClassLookup(element=_Tag))
Exemplo n.º 12
0
    def parse_xml(self, source, fragment=False, no_leading_text=False, **kw):
        """Parse a XML file

        In:
          - ``source`` -- can be a filename, a ``http://``, ``https://`` or
            ``ftp://`` URL, or a file object; it is always closed before
            returning, even when reading or parsing fails
          - ``fragment`` -- if ``True``, can parse a XML fragment i.e a XML without
            a unique root
          - ``no_leading_text`` -- if ``fragment`` is ``True`` and the XML to
            parse begins with a text, this text is kept unless
            ``no_leading_text`` is ``True``
          - ``kw`` -- keywords parameters are passed to the XML parser

        Return:
          - the root element of the parsed XML, if ``fragment`` is ``False``
          - a list of XML elements (preceded by the leading text, when kept),
            if ``fragment`` is ``True``
        """
        if isinstance(source, basestring):
            if source.startswith(('http://', 'https://', 'ftp://')):
                source = urllib.urlopen(source)
            else:
                source = open(source)

        # Create a dedicated XML parser with the ``kw`` parameter
        parser = ET.XMLParser(**kw)
        # This parser will generate nodes of type ``_Tag``
        # (``set_element_class_lookup`` replaces the deprecated
        # ``setElementClassLookup`` alias)
        parser.set_element_class_lookup(
            ET.ElementDefaultClassLookup(element=_Tag))

        if not fragment:
            # Parse a XML file
            # ----------------

            # ``finally`` guarantees the source is released on a parse error
            try:
                root = ET.parse(source, parser).getroot()
            finally:
                source.close()

            # Attach the renderer to the root
            root._renderer = self
            return root

        # Parse a XML fragment
        # --------------------

        # Create a dummy root; the source is closed even if reading fails
        try:
            xml = cStringIO.StringIO('<dummy>%s</dummy>' % source.read())
        finally:
            source.close()

        root = ET.parse(xml, parser).getroot()
        for e in root[:]:
            # Attach the renderer to each roots
            e._renderer = self

        # Return the children of the dummy root
        return ([root.text]
                if root.text and not no_leading_text else []) + root[:]
Exemplo n.º 13
0
def create_tag(name, value, attrib={}, **extra):
    """Create a ``MyWriter`` element, optionally filled with a text.

    ``attrib`` is copied before being merged with ``extra``, so neither the
    caller's mapping nor the shared default is ever modified.

    The element is always returned. Previously ``return`` sat inside the
    ``if value:`` branch, so a falsy ``value`` silently produced ``None``
    instead of an empty element.
    NOTE(review): confirm no caller relied on ``None`` to skip empty values.
    """
    #         print ("creating tag %s with value %s"%(name,value))
    attrib = attrib.copy()
    attrib.update(extra)
    parser_lookup = etree.ElementDefaultClassLookup(element=MyWriter)
    parser = etree.XMLParser()
    parser.set_element_class_lookup(parser_lookup)
    tag = parser.makeelement(name, attrib=attrib)
    if value:
        #             tag.text = value.decode("utf-8")
        tag.text = value
    return tag
Exemplo n.º 14
0
    def __init__(self, file_location):
        """
        Parser/iterator for the OAIRecord class. Iterates over ``record``
        elements in any namespace (repox or oai-pmh).

        :param file_location: forwarded unchanged to the parent initialiser
        """
        record_lookup = etree.ElementDefaultClassLookup(element=OAIRecord)
        record_parser = etree.XMLParser()
        record_parser.set_element_class_lookup(record_lookup)

        # ``{*}`` matches the tag regardless of its namespace
        super(OAIReader, self).__init__(
            file_location, '{*}record', parser=record_parser)
Exemplo n.º 15
0
    def __init__(self, file_location):
        """
        Parser/iterator for the MODSRecord class. Iterates on mods:mods elements.

        :param file_location: forwarded unchanged to the parent initialiser
        """
        record_lookup = etree.ElementDefaultClassLookup(element=MODSRecord)
        record_parser = etree.XMLParser()
        record_parser.set_element_class_lookup(record_lookup)

        # Tag name built from the ``mods`` entry of ``NAMESPACES``
        mods_tag = '{0}mods'.format(NAMESPACES['mods'])
        super(MODSReader, self).__init__(
            file_location, mods_tag, parser=record_parser)
Exemplo n.º 16
0
    def metadata(self):
        """
        Exposes the metadata content of an OAIRecord.

        The first child of the ``metadata`` element is serialised and parsed
        again with a parser producing the record class selected by its tag.

        :return: A reparsed root element either in the MODSRecord or DCRecord
            class, as appropriate; ``None`` when there is no ``metadata``
            element, when it has no child, or when the tag is not recognised.
        """
        record_data = self.find('./{*}metadata')
        if record_data is None:
            return None

        # Checked in order: the first marker found in the tag wins
        record_classes = (
            ('mods', MODSRecord),
            ('qualified', DCRecord),
            ('dc', DCRecord),
        )
        try:
            payload = record_data[0]
            for marker, record_class in record_classes:
                if marker not in payload.tag:
                    continue
                lookup = etree.ElementDefaultClassLookup(element=record_class)
                parser = etree.XMLParser()
                parser.set_element_class_lookup(lookup)
                serialized = etree.tostring(
                    payload, encoding='UTF-8').decode('utf-8')
                return etree.XML(serialized, parser=parser)
        except IndexError:
            # ``metadata`` element without any child
            pass
        return None
Exemplo n.º 17
0
    def _build(self):
        """Create the layout and define the widget attributes from tkouter html.

        Does nothing when ``self.layout`` is falsy. Otherwise the layout is
        rendered with ``self.context`` (loaded as a template name when it
        contains ``.html`` or ``xml``, used as inline template text
        otherwise), parsed into ``TkOutElement`` nodes, decorated with the
        declarations of any linked stylesheet, and finally every element is
        initialised and displayed.

        Raises:
            TagError: re-raised after printing the offending element.
        """
        if not self.layout:
            return

        env = Environment(loader=self.loader)
        if '.html' in self.layout or 'xml' in self.layout:
            template = env.get_template(self.layout)
            self._html = template.render(self.context)
        else:
            self._html = Template(self.layout).render(self.context)

        # lxml parser
        parser_lookup = etree.ElementDefaultClassLookup(element=TkOutElement)
        self._parser = etree.XMLParser()
        self._parser.set_element_class_lookup(parser_lookup)
        self._tree = etree.parse(StringIO(self._html), self._parser)

        # we should cache the elements for storing data to it
        self._proxy_cache = list(self._tree.getroot().iter())

        # css
        for link in self._tree.getroot().iter():
            href = link.get('href') if link.is_css else None
            if not href:
                continue
            self._css = env.get_template(href).render()
            self._css_parser = tinycss.make_parser()
            self._stylesheet = self._css_parser.parse_stylesheet(self._css)
            for rule in self._stylesheet.rules:
                # ``target`` is kept distinct from the outer loop variable
                for target in self._select(rule.selector.as_css()):
                    for d in rule.declarations:
                        # Attributes set in the markup take precedence
                        if target.get(d.name) is None:
                            target.set(d.name, d.value.as_css())

        # post init etree elements and display their widgets
        for e in self._tree.getroot().iter():
            try:
                e.init(self)
                e.display()
            except TagError:
                print('Error when parsing tag: ')
                print(
                    etree.tostring(e,
                                   pretty_print=True,
                                   encoding=str,
                                   method='html'))
                raise
Exemplo n.º 18
0
    def set_parser_to_relaxed(cls):
        """
        Installs a XML parser which attempts to recover syntactically-flawed XML.

        ``cls._parse_etree`` is replaced by the parent implementation with
        the relaxed parser bound as its ``parser`` argument.

        Returns:
            None
        """
        # Same idea as the default parser used by `lxml.fromstring()` (see
        # `lxml.GlobalParserTLS.createDefaultParser()`), but with
        # `recover=True` enabled and entity resolution disabled.
        parser_options = {'recover': True, 'resolve_entities': False}
        lenient_parser = etree.XMLParser(**parser_options)
        lenient_parser.set_element_class_lookup(
            etree.ElementDefaultClassLookup(element=RestrictedElement))

        # Inject parser
        parent_parse = super()._parse_etree
        cls._parse_etree = partial(parent_parse, parser=lenient_parser)
Exemplo n.º 19
0
    def class_init(cls, specialTags):
        """Class initialisation

        In:
          -- ``specialTags`` -- tags that have a special factory; merged into
             ``cls._specialTags``
        """
        class CustomLookup(ET.CustomElementClassLookup):
            """Pick the element class from the special tags, by tag name."""

            def __init__(self, specialTags, defaultLookup):
                super(CustomLookup, self).__init__(defaultLookup)
                self._specialTags = specialTags

            def lookup(self, node_type, document, namespace, name):
                # ``None`` for an unknown name
                return self._specialTags.get(name)

        cls._specialTags.update(specialTags)

        cls._custom_lookup = CustomLookup(cls._specialTags, ET.ElementDefaultClassLookup(element=xhtml_base._HTMLTag))
        cls._html_parser = ET.HTMLParser()
        # ``set_element_class_lookup`` replaces the deprecated
        # ``setElementClassLookup`` alias
        cls._html_parser.set_element_class_lookup(cls._custom_lookup)
Exemplo n.º 20
0
    async def __aiter__(self):
        """Stream the compressed XML found at ``self.value``, element by element.

        A ``HEADERS`` element carrying the response headers as attributes is
        yielded first. Each parsed element is then yielded as soon as its end
        tag has been read, and cleared once the consumer resumes iteration.
        """
        url = self.value

        parser = etree.XMLPullParser(
            ["end"], base_url=url, remove_blank_text=True)
        element_lookup = etree.ElementDefaultClassLookup(element=_NSElement)
        parser.set_element_class_lookup(element_lookup)
        events = parser.read_events()
        # wbits of 16 + MAX_WBITS makes zlib expect a gzip container
        inflater = zlib.decompressobj(16 + zlib.MAX_WBITS)

        request = Api.session.request(
            "GET", url, headers={"User-Agent": Api.agent})
        async with request as response:
            yield parser.makeelement("HEADERS", attrib=response.headers)
            async for chunk, _ in response.content.iter_chunks():
                parser.feed(inflater.decompress(chunk))
                for _, element in events:
                    yield element
                    element.clear()
Exemplo n.º 21
0
Arquivo: api.py Projeto: DolphDev/sans
    async def __aiter__(
            self,
            *,
            no_clear: bool = False) -> _AsyncGenerator[NSElement, None]:
        """Request ``str(self)`` and yield the parsed XML elements as they arrive.

        The body is decoded with the charset announced in the
        ``Content-Type`` header. An incremental decoder is used because a
        multi-byte character may be split across two network chunks, which
        made decoding each chunk on its own fail.

        Args:
            no_clear: when ``False`` (default) only the direct children of
                the root are yielded; each one is cleared afterwards and its
                preceding siblings are removed from the root. When ``True``
                every element is yielded and nothing is cleared.

        Raises:
            RuntimeError: if the user agent is not set, or if the request is
                an API telegram (``a=sendtg``).
            ValueError: if the request is empty.
        """
        # Local import: the file-level import block is not visible from here
        import codecs

        if not self.agent:
            raise RuntimeError("The API's user agent is not yet set.")
        if not self:
            # Preempt the request to conserve ratelimit
            raise ValueError("Bad request")
        if "a" in self and self["a"].lower() == "sendtg":
            raise RuntimeError(
                "This API wrapper does not support API telegrams.")
        url = str(self)

        parser = etree.XMLPullParser(["end"],
                                     base_url=url,
                                     remove_blank_text=True)
        parser.set_element_class_lookup(
            etree.ElementDefaultClassLookup(element=NSElement))
        events = parser.read_events()

        async with self.session.request("GET",
                                        url,
                                        headers={"User-Agent":
                                                 self.agent}) as response:
            encoding = response.headers["Content-Type"].split(
                "charset=")[1].split(",")[0]
            # Keeps the bytes of an incomplete character until the next chunk
            decoder = codecs.getincrementaldecoder(encoding)()
            async for data, _ in response.content.iter_chunks():
                text = decoder.decode(data)
                if text:
                    parser.feed(text)
                for _, element in events:
                    # When clearing, skip everything but the root's children
                    if not no_clear and (element.getparent() is None
                                         or element.getparent().getparent()
                                         is not None):
                        continue
                    yield element
                    if no_clear:
                        continue
                    element.clear()
                    while element.getprevious() is not None:
                        del element.getparent()[0]
            # Still report a body that ends in the middle of a character
            decoder.decode(b"", final=True)
Exemplo n.º 22
0
    async def __aiter__(self, *, clear: bool = True):
        """Request ``str(self)`` and yield the parsed XML elements as they arrive.

        A ``HEADERS`` element carrying the response headers as attributes is
        yielded first. The body is decoded with the charset announced in the
        ``Content-Type`` header. An incremental decoder is used because a
        multi-byte character may be split across two network chunks, which
        made decoding each chunk on its own fail.

        Args:
            clear: when ``True`` (default) each element is cleared once the
                consumer resumes iteration.

        Raises:
            ValueError: if the request is empty.
        """
        # Local import: the file-level import block is not visible from here
        import codecs

        if not self:
            raise ValueError("Bad request")
        url = str(self)

        parser = etree.XMLPullParser(["end"],
                                     base_url=url,
                                     remove_blank_text=True)
        parser.set_element_class_lookup(
            etree.ElementDefaultClassLookup(element=_NSElement))
        events = parser.read_events()

        async with type(self).session.request(
                "GET", url, headers={"User-Agent":
                                     type(self).agent}) as response:
            yield parser.makeelement("HEADERS", attrib=response.headers)
            encoding = response.headers["Content-Type"].split(
                "charset=")[1].split(",")[0]
            # Keeps the bytes of an incomplete character until the next chunk
            decoder = codecs.getincrementaldecoder(encoding)()
            async for data, _ in response.content.iter_chunks():
                text = decoder.decode(data)
                if text:
                    parser.feed(text)
                for _, element in events:
                    yield element
                    if clear:
                        element.clear()
            # Still report a body that ends in the middle of a character
            decoder.decode(b"", final=True)
Exemplo n.º 23
0
 def __init__(self):
     """Create a XML parser mapping each known tag name to its PDF class.

     Tags of the empty namespace listed below get a dedicated class; any
     other element falls back to ``PDFXML``.
     """
     self.parser = etree.XMLParser()
     fallback = etree.ElementDefaultClassLookup(PDFXML)
     lookup = etree.ElementNamespaceClassLookup(fallback)
     namespace = lookup.get_namespace(None)
     element_classes = (
         # leafs
         ('name', PDFName),
         ('string', PDFString),
         ('number', PDFNumber),
         ('null', PDFNull),
         ('bool', PDFBool),
         ('R', PDFR),
         ('header', PDFHeader),
         ('startxref', PDFStartxref),
         ('data', PDFData),
         # trees
         ('entry', PDFEntry),
         ('dictionary', PDFDictionary),
         ('stream', PDFStream),
         ('pdf', PDFPdf),
         ('pdf_update', PDFUpdate),
         ('indirect_object', PDFIndirect),
         ('array', PDFArray),
     )
     for tag_name, element_class in element_classes:
         namespace[tag_name] = element_class
     self.parser.set_element_class_lookup(lookup)
Exemplo n.º 24
0
def main(benchmark_class):
    """Run the benchmarks of ``benchmark_class`` against the selected libraries.

    Command line flags, removed from ``sys.argv`` once recognised:
      - ``-i`` -- run 'inplace': ``src`` is put first on ``sys.path``
      - ``-nolxml`` -- do not benchmark lxml
      - ``-z`` -- write a ``callgrind.cmd`` file resetting callgrind after
        the tree setup
      - ``-fel`` -- use fast element creation in lxml.etree
      - ``-c`` -- also benchmark cElementTree
      - ``-a`` -- also benchmark cElementTree and ElementTree

    The remaining arguments select the benchmarks to run. The process exits
    with status 1 when no library is available.
    """
    def pop_flag(flag):
        """Remove ``flag`` from ``sys.argv``; tell whether it was present."""
        try:
            sys.argv.remove(flag)
        except ValueError:
            return False
        return True

    import_lxml = True
    callgrind_zero = False
    if len(sys.argv) > 1:
        if pop_flag('-i'):
            # run benchmark 'inplace'
            sys.path.insert(0, 'src')

        if pop_flag('-nolxml'):
            # run without lxml
            import_lxml = False

        if pop_flag('-z'):
            # reset callgrind after tree setup
            callgrind_zero = True

        initArgs(sys.argv)

    _etrees = []
    if import_lxml:
        from lxml import etree
        _etrees.append(etree)

        if pop_flag('-fel'):
            # use fast element creation in lxml.etree
            etree.set_element_class_lookup(etree.ElementDefaultClassLookup())

    if len(sys.argv) > 1:
        if '-a' in sys.argv or '-c' in sys.argv:
            # 'all' or 'C-implementations' ?
            pop_flag('-c')
            try:
                import cElementTree as cET
                _etrees.append(cET)
            except ImportError:
                try:
                    import xml.etree.cElementTree as cET
                    _etrees.append(cET)
                except ImportError:
                    pass

        # 'all' ?
        if pop_flag('-a'):
            try:
                from elementtree import ElementTree as ET
                _etrees.append(ET)
            except ImportError:
                try:
                    from xml.etree import ElementTree as ET
                    _etrees.append(ET)
                except ImportError:
                    pass

    if not _etrees:
        print("No library to test. Exiting.")
        sys.exit(1)

    print("Preparing test suites and trees ...")
    selected = set(sys.argv[1:])
    benchmark_suites, benchmarks = \
                      buildSuites(benchmark_class, _etrees, selected)

    print("Running benchmark on",
          ', '.join(b.lib_name for b in benchmark_suites))
    print('')

    printSetupTimes(benchmark_suites)

    if callgrind_zero:
        # The context manager closes the file even if a write fails
        with open("callgrind.cmd", 'w') as cmd:
            cmd.write('+Instrumentation\n')
            cmd.write('Zero\n')

    runBenchmarks(benchmark_suites, benchmarks)
Exemplo n.º 25
0
class RDFXMLReader:
    """Read an RDF/XML document and yield the triples it describes.

    The document is parsed with lxml using the module's custom ``Element``
    class (unless the caller supplies a parser) and then walked production
    by production.  The section numbers quoted in the comments appear to
    refer to the W3C RDF/XML Syntax Specification.

    Triples are produced lazily as ``(subject, predicate, object)`` tuples.
    Syntax violations are reported by raising ``ParseError``.
    """

    # RDF terms grouped by the syntactic role they may (not) play.
    CORE_SYNTAX_TERMS = {
        RDF.RDF, RDF.ID, RDF.about, RDF.parseType, RDF.resource, RDF.nodeID,
        RDF.datatype
    }
    SYNTAX_TERMS = CORE_SYNTAX_TERMS | {RDF.Description, RDF.li}
    OLD_TERMS = {RDF.aboutEach, RDF.aboutEachPrefix, RDF.bagID}
    XML_TERMS = {XML.base, XML.lang}

    ILLEGAL_NODE_TAGS = CORE_SYNTAX_TERMS | {RDF.li} | OLD_TERMS
    ILLEGAL_PROPERTY_TAGS = CORE_SYNTAX_TERMS | {RDF.Description} | OLD_TERMS
    ILLEGAL_PROPERTY_ATTRS = SYNTAX_TERMS | OLD_TERMS

    # Shared class lookup so every default parser builds ``Element`` nodes.
    _PARSER_LOOKUP = etree.ElementDefaultClassLookup(element=Element)

    def __init__(self, parser=None):
        """Create a reader.

        Args:
            parser: an lxml parser to use.  When ``None`` a fresh XML parser
                is created that drops comments and processing instructions
                and builds ``Element`` instances.
        """
        if parser is None:
            parser = etree.XMLParser(remove_comments=True, remove_pis=True)
            parser.set_element_class_lookup(self._PARSER_LOOKUP)
        self.parser = parser

    def read(self, lines, base_uri=None):
        """Parse ``lines`` and yield every triple found in it.

        Args:
            lines: the source handed to ``etree.parse``.
            base_uri: optional base URI, forwarded as ``base_url``.

        Yields:
            ``(subject, predicate, object)`` tuples.

        Raises:
            ParseError: when the document violates the RDF/XML syntax rules
                checked by this reader.
        """
        root = etree.parse(lines, self.parser, base_url=base_uri).getroot()
        # rdf:ID values must be unique within the document; track them here.
        ids = set()
        # rdf:RDF is not necessarily the root element.
        for element in root if root.uri == RDF.RDF else [root]:
            self._validate(element)
            yield from self._node_element(element, ids)

    def _validate(self, element):
        """Strip ignorable attributes from ``element`` and reject obsolete ones.

        Attributes without a namespace and XML-namespace attributes that are
        not in ``_XML_ATTRS`` are removed.  ``rdf:bagID`` is checked against
        the NCName pattern and then removed.  Any attribute listed in
        ``_OLD_ATTRS`` raises ``ParseError``.
        """
        # items() returns a snapshot, so deleting while looping is safe.
        for attr, value in element.items():
            attr = QName(attr)
            # Ignore unknown and reserved XML attributes.
            if attr.namespace is None or (attr.namespace == XML
                                          and attr not in _XML_ATTRS):
                del element.attrib[attr]
            # Validate but ignore old syntax terms.
            elif attr == QName(RDF, 'bagID'):
                if not _NCNAME.match(value):
                    raise ParseError(
                        "rdf:bagID does not match NCName: {!r}".format(value))
                del element.attrib[attr]
            elif attr in _OLD_ATTRS:
                raise ParseError(
                    "Obsolete attribute is not allowed: {!s}".format(attr))

    def _node_element(self, element, ids):
        """Yield the triples of one node element and store its subject on it."""
        # 7.2.11 Production nodeElement
        self._validate(element)
        if element.uri in self.ILLEGAL_NODE_TAGS:
            raise ParseError("Illegal node element: {!s}".format(element.tag))

        element.subject = self._subject(element, ids)

        # 2.13 Typed Node Elements
        if element.uri != RDF.Description:
            yield (element.subject, RDF.type, element.uri)

        yield from self._property_attrs(element)
        yield from self._property_elements(element, ids)

    def _subject(self, element, ids):
        """Return the subject named by rdf:ID, rdf:nodeID or rdf:about.

        At most one of the three attributes may be present; when none is,
        a fresh blank node is returned.
        """
        id_ = self._id(element, ids)
        node_id = element.get(QName(RDF, 'nodeID'))
        about = element.get(QName(RDF, 'about'))
        if id_ is not None:
            if node_id is not None or about is not None:
                raise ParseError(
                    "rdf:ID cannot be combined with rdf:nodeID or rdf:about")
            return id_
        if node_id is not None:
            if about is not None:
                raise ParseError(
                    "rdf:nodeID cannot be combined with rdf:about")
            if not _NCNAME.match(node_id):
                raise ParseError(
                    "rdf:nodeID does not match NCName: {!r}".format(node_id))
            return BlankNode(node_id)
        if about is not None:
            return self._uri(about, element.base_uri)
        return BlankNode()

    def _uri(self, uri, base_uri=None):
        """Resolve ``uri`` against ``base_uri`` and wrap the result in ``URI``."""
        # An empty reference resolves to the base without its fragment.
        if base_uri and not uri:
            base_uri = base_uri.rsplit('#', 1)[0]
        return URI(urllib.parse.urljoin(base_uri or '', uri))

    def _id(self, element, ids):
        """Return the URI for the element's rdf:ID, or ``None`` if it has none.

        The URI is recorded in ``ids``; a repeated or malformed rdf:ID raises
        ``ParseError``.
        """
        name = element.get(QName(RDF, 'ID'))
        if name is None:
            return None
        if not _NCNAME.match(name):
            raise ParseError(
                "rdf:ID does not match NCName: {!r}".format(name))
        uri = self._uri('#' + name, element.base_uri)
        if uri in ids:
            raise ParseError("rdf:ID is not unique: {!r}".format(uri))
        ids.add(uri)
        return uri

    def _property_attrs(self, element):
        """Yield one triple per property attribute of a node element."""
        # 2.5 Property Attributes
        for attr, value in element.items():
            if attr not in _XML_ATTRS:
                predicate = URI(QName(attr))
                if predicate not in self.ILLEGAL_PROPERTY_ATTRS:
                    if predicate != RDF.type:
                        object_ = PlainLiteral(value, element.language)
                    else:
                        object_ = URI(value)
                    yield (element.subject, predicate, object_)
                elif predicate == RDF.li:
                    raise ParseError("rdf:li is not allowed as attribute")

    def _property_elements(self, parent, ids):
        """Yield the triples of every property element below ``parent``.

        Each child is dispatched to the handler matching its shape:
        an explicit rdf:parseType, a single nested node element, literal
        text, or an empty element.
        """
        # 7.2.13 Production propertyEltList
        li_counter = 1
        for element in parent:
            # 7.2.14 Production propertyElt
            self._validate(element)
            if element.uri in self.ILLEGAL_PROPERTY_TAGS:
                raise ParseError("Illegal property element: {!s}".format(
                    element.tag))
            elif element.uri == RDF.li:
                # Container Membership Property Elements: rdf:li and rdf:_n
                element.uri = RDF['_' + str(li_counter)]
                li_counter += 1

            parse_type = element.attrib.get(QName(RDF, 'parseType'))
            legal_attrs = _XML_ATTRS | {QName(RDF, 'ID')}
            if parse_type is not None:
                legal_attrs.add(QName(RDF, 'parseType'))
                if any(attr not in legal_attrs for attr in element.keys()):
                    raise ParseError(
                        "Illegal attribute on rdf:parseType property "
                        "element: {!s}".format(element.tag))
                elif parse_type == 'Resource':
                    triples = self._parse_type_resource_property(
                        element, parent, ids)
                elif parse_type == 'Collection':
                    triples = self._parse_type_collection_property(
                        element, parent, ids)
                else:
                    triples = self._parse_type_literal_property(
                        element, parent, ids)
            elif len(element) == 1:
                # Only rdf:ID and the XML attributes may accompany a nested
                # node element; every attribute has to be a legal one.
                if all(attr in legal_attrs for attr in element.keys()):
                    triples = self._resource_property(element, parent, ids)
                else:
                    raise ParseError(
                        "Illegal attribute on resource property "
                        "element: {!s}".format(element.tag))
            elif len(element) == 0:
                if element.text:
                    legal_attrs.add(QName(RDF, 'datatype'))
                    if all(attr in legal_attrs for attr in element.keys()):
                        triples = self._literal_property(element, parent, ids)
                    else:
                        raise ParseError(
                            "Illegal attribute on literal property "
                            "element: {!s}".format(element.tag))
                else:
                    triples = self._empty_property(element, parent, ids)
            else:
                # Without rdf:parseType a property element may hold at most
                # one node element.
                raise ParseError(
                    "Property element has more than one child: {!s}".format(
                        element.tag))
            yield from triples

    def _reify(self, uri, triple):
        """Yield the four statements that reify ``triple`` as ``uri``."""
        yield (uri, RDF.type, RDF.Statement)
        yield (uri, RDF.subject, triple[0])
        yield (uri, RDF.predicate, triple[1])
        yield (uri, RDF.object, triple[2])

    def _statement(self, element, ids, triple):
        """Yield ``triple`` and, if ``element`` has an rdf:ID, its reification."""
        yield triple
        id_ = self._id(element, ids)
        if id_ is not None:
            # 7.3 Reification Rules
            yield from self._reify(id_, triple)

    def _resource_property(self, element, parent, ids):
        """Handle a property element whose value is a nested node element."""
        # 7.2.15 Production resourcePropertyElt
        node_element = element[0]
        yield from self._node_element(node_element, ids)
        triple = (parent.subject, element.uri, node_element.subject)
        yield from self._statement(element, ids, triple)

    def _literal_property(self, element, parent, ids):
        """Handle a property element whose value is its text content."""
        # 7.2.16 Production literalPropertyElt
        datatype = element.get(QName(RDF, 'datatype'))
        if datatype is not None:
            object_ = TypedLiteral(element.text, URI(datatype))
        else:
            object_ = PlainLiteral(element.text, element.language)
        triple = (parent.subject, element.uri, object_)
        yield from self._statement(element, ids, triple)

    def _parse_type_resource_property(self, element, parent, ids):
        """Handle rdf:parseType="Resource" via a synthetic rdf:Description."""
        # 7.2.18 Production parseTypeResourcePropertyElt
        node_element = element.makeelement(QName(RDF, 'Description'))
        # Move the property element's children under the synthetic node.
        node_element[:] = element
        yield from self._node_element(node_element, ids)
        triple = (parent.subject, element.uri, node_element.subject)
        yield from self._statement(element, ids, triple)

    def _parse_type_collection_property(self, element, parent, ids):
        """Handle rdf:parseType="Collection" as an rdf:first/rdf:rest list."""
        # 7.2.19 Production parseTypeCollectionPropertyElt
        node_ids = []
        for node_element in element:
            yield from self._node_element(node_element, ids)
            node_ids.append((node_element, BlankNode()))
        # The property points at the first list cell, or rdf:nil when empty.
        object_ = node_ids[0][1] if node_ids else RDF.nil
        triple = (parent.subject, element.uri, object_)
        yield from self._statement(element, ids, triple)
        last = len(node_ids) - 1
        for i, (node_element, cell) in enumerate(node_ids):
            yield (cell, RDF.first, node_element.subject)
            rest = node_ids[i + 1][1] if i < last else RDF.nil
            yield (cell, RDF.rest, rest)

    def _parse_type_literal_property(self, element, parent, ids):
        """Handle rdf:parseType="Literal" (and unknown parse types).

        The literal is the element's leading text followed by the exclusive
        canonical serialization of every child and the text after each one.
        """
        parts = [element.text or ""]
        for child in element:
            bytes_io = BytesIO()
            etree.ElementTree(child).write_c14n(
                bytes_io, exclusive=True, with_comments=True)
            parts.append(bytes_io.getvalue().decode('utf-8'))
            parts.append(child.tail or "")
        object_ = TypedLiteral("".join(parts), RDF.XMLLiteral)
        triple = (parent.subject, element.uri, object_)
        yield from self._statement(element, ids, triple)

    def _empty_property(self, element, parent, ids):
        """Handle a property element with neither children nor text.

        With only rdf:ID / XML attributes the value is the empty plain
        literal.  Otherwise the value is the resource named by rdf:resource
        or rdf:nodeID (or a fresh blank node), and every remaining attribute
        becomes a property of that resource.
        """
        # 7.2.21 Production emptyPropertyElt
        # Resolve rdf:ID first so a bad ID fails before anything is yielded.
        id_ = self._id(element, ids)
        literal_attrs = _XML_ATTRS | {QName(RDF, 'ID')}
        if all(attr in literal_attrs for attr in element.keys()):
            object_ = PlainLiteral("", element.language)
            triple = (parent.subject, element.uri, object_)
            yield triple
            if id_ is not None:
                yield from self._reify(id_, triple)
            return

        resource = element.attrib.get(QName(RDF, 'resource'))
        node_id = element.attrib.get(QName(RDF, 'nodeID'))
        if resource is not None:
            if node_id is not None:
                raise ParseError(
                    "rdf:resource cannot be combined with rdf:nodeID")
            object_ = self._uri(resource, element.base_uri)
        elif node_id is not None:
            if not _NCNAME.match(node_id):
                raise ParseError(
                    "rdf:nodeID does not match NCName: {!r}".format(
                        node_id))
            object_ = BlankNode(node_id)
        else:
            object_ = BlankNode()
        triple = (parent.subject, element.uri, object_)
        yield triple
        if id_ is not None:
            yield from self._reify(id_, triple)

        subject = object_
        consumed = literal_attrs | {
            QName(RDF, 'resource'),
            QName(RDF, 'nodeID')
        }
        # Walk the attributes in document order so output is deterministic.
        for attr in element.keys():
            if attr in consumed:
                continue
            predicate = URI(QName(attr))
            if predicate in self.XML_TERMS:
                continue
            elif predicate in self.ILLEGAL_PROPERTY_ATTRS:
                raise ParseError(
                    "Illegal property attribute: {!s}".format(attr))
            value = element.get(attr)
            if predicate != RDF.type:
                object_ = PlainLiteral(value, element.language)
            else:
                object_ = self._uri(value, element.base_uri)
            yield (subject, predicate, object_)
Exemplo n.º 26
0
def HTMLParser(*args, **kwargs):
    """Return an lxml HTML parser that builds ``HTMLElement`` nodes.

    All positional and keyword arguments are forwarded unchanged to
    ``etree.HTMLParser``.
    """
    html_parser = etree.HTMLParser(*args, **kwargs)
    html_parser.set_element_class_lookup(
        etree.ElementDefaultClassLookup(element=HTMLElement))
    return html_parser
Exemplo n.º 27
0
def HTMLParser(*args, **kwargs):
    """Return an lxml HTML parser that builds ``HTMLElement`` nodes.

    Arguments are forwarded to ``etree.HTMLParser``; when the caller gives
    no ``encoding`` keyword, ``'utf-8'`` is used.
    """
    if 'encoding' not in kwargs:
        kwargs['encoding'] = 'utf-8'
    html_parser = etree.HTMLParser(*args, **kwargs)
    html_parser.set_element_class_lookup(
        etree.ElementDefaultClassLookup(element=HTMLElement))
    return html_parser
Exemplo n.º 28
0
        return True


class ExactlyOneError(ValueError):
    """Signals that a sequence did not contain exactly one item."""


def one(mylist):
    """
    Return the single item held by ``mylist``.

    Raises ExactlyOneError when the length is anything other than one; the
    message lists the items, with lxml elements rendered through their
    ``tostring()`` method.
    """
    count = len(mylist)
    if count == 1:
        return mylist[0]

    rendered = []
    for item in mylist:
        if isinstance(item, etree.ElementBase):
            rendered.append(item.tostring())
        else:
            rendered.append(item)

    raise ExactlyOneError(
        'Expected exactly one item. Got %i: %r' % (count, rendered)
    )


# Class lookup that tells lxml to instantiate RefactorLibNodeBase for elements;
# meant to be installed on a parser via set_element_class_lookup().
node_lookup = etree.ElementDefaultClassLookup(element=RefactorLibNodeBase)

# Only the node base class is exported by a star-import of this module.
__all__ = ('RefactorLibNodeBase',)
Exemplo n.º 29
0
    @staticmethod
    def to_string(element):
        """Serialize ``element`` to pretty-printed UTF-8 XML bytes.

        ``element`` may also be a callable; it is then called with no
        arguments and its result is serialized.  The output includes the
        XML declaration.
        """
        node = element() if callable(element) else element
        serialize_options = {
            'encoding': 'utf-8',
            'xml_declaration': True,
            'pretty_print': True,
        }
        return etree.tostring(node, **serialize_options)


# Namespace-based class lookup; elements with no class registered for their
# namespace fall back to the module's ElementBase.
XMLParserLookup = etree.ElementNamespaceClassLookup(
    fallback=etree.ElementDefaultClassLookup(element=ElementBase))

# Shared module-level parser using the lookup above.
# NOTE(review): no_network=False lets lxml fetch external resources over the
# network while parsing -- confirm this is intended for untrusted input.
XMLParser = etree.XMLParser(encoding='utf-8', no_network=False)
XMLParser.set_element_class_lookup(XMLParserLookup)

# Element factory with the SOAP namespace map and no default namespace;
# elements are created through XMLParser so they get the custom classes.
E = ElementMaker(
    nsmap=SOAP_NSMAP,
    makeelement=XMLParser.makeelement,
)

# Same as E, but elements are created in the SOAP envelope namespace.
S = ElementMaker(
    namespace=SOAP_ENV_URI,
    nsmap=SOAP_NSMAP,
    makeelement=XMLParser.makeelement,
)
Exemplo n.º 30
0
# create etree parser using custom Element class


class LayoutElement(etree.ElementBase):
    """lxml element that can carry an associated ``layout`` object."""

    @property
    def layout(self):
        """The attached layout object, or ``None`` when none has been set."""
        try:
            return self._layout
        except AttributeError:
            # First access: create the backing attribute.
            self._layout = None
            return self._layout

    @layout.setter
    def layout(self, new_layout):
        self._layout = new_layout


# Module-level XML parser whose elements are LayoutElement instances, so
# parsed nodes expose the ``layout`` property.
parser_lookup = etree.ElementDefaultClassLookup(element=LayoutElement)
parser = etree.XMLParser()
parser.set_element_class_lookup(parser_lookup)


# main class
class PDFQuery(object):
    def __init__(
        self,
        file,
        merge_tags=('LTChar', 'LTAnno'),
        round_floats=True,
        round_digits=3,
        input_text_formatter=None,
        normalize_spaces=True,
        resort=True,