Example No. 1
    def process_node(self, session, data):

        # Turn into SAX and process_eventList() for the mean time
        handler = SaxContentHandler()
        sax.saxify(data, handler)
        saxl = handler.currentText
        return self.process_eventList(session, saxl)
Example No. 2
def parse_document(file_name):
    tree = etree.parse(file_name)
    handler = ErdmanTransformer()
    saxify(tree, handler)
    handler.save_page()  # get the last page
    titles = get_titles(tree)
    return handler, titles
Example No. 3
def parse_document(file_name):
    tree = etree.parse(file_name)
    handler = ErdmanTransformer()
    saxify(tree, handler)
    handler.save_page()  # get the last page
    titles = get_titles(tree)
    return handler, titles
Example No. 4
def fuck_dom(page):
    page = UnicodeDammit(page).unicode_markup
    tree = etree.fromstring(page, etree.HTMLParser())
    #tree.docinfo.encoding = "utf-8"
    handler = SAX2DOM()
    sax.saxify(tree, handler)
    return handler.document
Example No. 5
 def get_sax(self, session):
     if (not self.sax):
         handler = SaxContentHandler()
         sax.saxify(self.dom, handler)
         self.sax = handler.currentText
         self.sax.append("9 %r" % handler.elementHash)
     return self.sax
Example No. 6
 def get_sax(self, session):
     if (not self.sax):
         handler = SaxContentHandler()
         sax.saxify(self.dom, handler)
         self.sax = handler.currentText
         self.sax.append("9 %r" % handler.elementHash)
     return self.sax
Example No. 7
    def process_node(self, session, data):

        # Turn into SAX and process_eventList() for the mean time
        handler = SaxContentHandler()
        sax.saxify(data, handler)
        saxl = handler.currentText
        return self.process_eventList(session, saxl)
Example No. 8
    def test_sax_to_pulldom_multiple_namespaces(self):
        tree = self.parse('<a xmlns="blaA" xmlns:a="blaA"></a>')
        handler = pulldom.SAX2DOM()
        sax.saxify(tree, handler)
        dom = handler.document

        # With multiple prefix definitions, the node should keep the one
        # that was actually used, even if the others are also valid.
        self.assertEqual('a',
                         dom.firstChild.localName)
        self.assertEqual('blaA',
                         dom.firstChild.namespaceURI)
        self.assertEqual(None,
                         dom.firstChild.prefix)

        tree = self.parse('<a:a xmlns="blaA" xmlns:a="blaA"></a:a>')
        handler = pulldom.SAX2DOM()
        sax.saxify(tree, handler)
        dom = handler.document

        self.assertEqual('a',
                         dom.firstChild.localName)
        self.assertEqual('blaA',
                         dom.firstChild.namespaceURI)
        self.assertEqual('a',
                         dom.firstChild.prefix)
Example No. 9
def flexify(html_root, flex_path):
    """Insert FLEx annotations after every Zapotec word in the HTML root element."""
    with open(flex_path, 'r', encoding='utf-8') as f:
        flex_dict = json.load(f)
    print('{} words in the FLEx dictionary'.format(len(flex_dict)))
    handler = FLExParser(flex_dict)
    sax.saxify(html_root, handler)
    print('Processed {0.total} word(s), missed {0.missed}'.format(handler))
    return handler.etree
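
Note: a hedged usage sketch for flexify() above; the input markup is inlined for brevity, the JSON path is hypothetical, and FLExParser comes from the same project as flexify().

# Hedged usage sketch; 'flex_export.json' is a hypothetical path and the
# input markup is a stand-in for the real Zapotec HTML.
from lxml import etree

html_root = etree.fromstring('<html><body><p>word</p></body></html>')
annotated = flexify(html_root, 'flex_export.json')
print(etree.tostring(annotated, pretty_print=True).decode('utf-8'))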
Example No. 10
    def process_node(self, session, data):
        """Walk a DOM structure, extract and return.

        Turn into SAX and process_eventList() for the mean time.
        """
        handler = SaxContentHandler()
        sax.saxify(data, handler)
        saxl = handler.currentText
        return self.process_eventList(session, saxl)
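
Note: the SaxContentHandler used in these process_node() snippets is not shown here; the sketch below is a generic lxml.sax handler that merely records the events saxify() replays. It is not the Cheshire3 handler, and its tuple-based event format is an illustration only.

# Hedged sketch: a minimal content handler that records the SAX events
# produced by lxml.sax.saxify(). It is NOT the SaxContentHandler above.
from xml.sax.handler import ContentHandler
from lxml import etree, sax

class EventListHandler(ContentHandler):
    def __init__(self):
        ContentHandler.__init__(self)
        self.events = []

    def startElementNS(self, name, qname, attributes):
        # name is a (namespace_uri, local_name) tuple
        self.events.append(('start', name[1], dict(attributes or {})))

    def characters(self, data):
        self.events.append(('text', data))

    def endElementNS(self, name, qname):
        self.events.append(('end', name[1]))

tree = etree.fromstring('<doc><p>hello</p></doc>')
handler = EventListHandler()
sax.saxify(tree, handler)   # walks the tree and fires the callbacks above
print(handler.events)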
Example No. 11
    def process_node(self, session, data):
        """Walk a DOM structure, extract and return.

        Turn into SAX and process_eventList() for the mean time.
        """
        handler = SaxContentHandler()
        sax.saxify(data, handler)
        saxl = handler.currentText
        return self.process_eventList(session, saxl)
Example No. 12
    def write (self, result_set):
        logging.info ('writing %s' % self.type ())

        if self.__file_target__ is None:
            writer = SAXWriter (result_set.source, 2)
        else:
            writer = SAXWriter (self.__file_target__, 2)

        logging.info ('write from source: %s' % result_set.source)
        resultset_w = etree.Element ('result')
        resultset_w.set ("source", result_set.source)
        resultset_w.set ('args', result_set.args)

        from html import escape

        for weakness in result_set.iterate_Weaknesses ():
            weaknesselement = etree.SubElement (resultset_w, "weakness")
            weaknesselement.set ("id", weakness.name)

            for suite in weakness.iterate_Suites ():
                suiteelement = etree.SubElement (weaknesselement, "suite")

                suiteelement.set ("dir", suite.directory)
                suiteelement.set ("tool", suite.compiler)
                suiteelement.set ("args", suite.args)

                for file in suite.iterate_Files ():
                    for function in file.iterate_Functions ():
                        for line in function.iterate_Lines ():

                            for flaw in line.iterate_Flaws ():

                                flawelement = etree.SubElement (suiteelement, "flaw")
                                attrib = flawelement.attrib

                                attrib['file'] = file.filename
                                attrib['function'] = function.function
                                attrib['line'] = str (line.line)
                                attrib['severity'] = str (flaw.severity.name)
                                attrib['description'] = escape (flaw.description)

                            for bug in line.iterate_Bugs ():
                                bugelement = etree.SubElement (suiteelement, "bug")
                                attrib = bugelement.attrib
                                attrib['filename'] = file.filename
                                attrib['function'] = function.function
                                attrib['line'] = str (line.line)
                                attrib['type'] = bug.type
                                attrib['message'] = escape (bug.message)

        sax.saxify (resultset_w, writer)

        if self.__file_target__ is None:
            logging.info ("Write successful on file: %s" % (result_set.source))
        else:
            logging.info ("Write successful on file: %s" % (self.__file_target__))
Example No. 13
def strip_html(doc):
    tree = etree.fromstring(doc)
    handler = TextOnlyContentHandler()
    sax.saxify(tree, handler)
    links_list = ""
    for i, link in enumerate(handler.links):
        links_list += "\n[%d] %s" % (i + 1, link)

    text = " ".join(handler.text)
    return wrap(text, 72) + "\n\n----" + links_list
Example No. 14
def strip_html(doc):
    tree = etree.fromstring(doc)
    handler = TextOnlyContentHandler()
    sax.saxify(tree, handler)
    links_list = ""
    for i, link in enumerate(handler.links):
        links_list += "\n[%d] %s" % (i + 1, link)

    text = " ".join(handler.text)
    return wrap(text, 72) + "\n\n----" + links_list
Example No. 15
    def __init__(self, content):
        ContentHandler.__init__(self)

        self.content = False
        self.tree = []
        self.stack = []
        self.stack_usage = []

        self.strong = False
        self.emphasis = False

        saxify(content, self)
Example No. 16
def markdown_souptest(text, nofollow=False, target=None, lang=None):
    if not text:
        return text

    smd = safemarkdown(text, nofollow, target, lang)

    s = StringIO(smd)
    tree = lxml.etree.parse(s)
    handler = SouptestSaxHandler(markdown_ok_tags)
    saxify(tree, handler)

    return smd
Example No. 17
def markdown_souptest(text, nofollow=False, target=None, lang=None):
    if not text:
        return text

    smd = safemarkdown(text, nofollow, target, lang)

    s = StringIO(smd)
    tree = lxml.etree.parse(s)
    handler = SouptestSaxHandler(markdown_ok_tags)
    saxify(tree, handler)

    return smd
Example No. 18
def paginate(pseudo_html_root, text_name):
    """
    Paginate the output of the XSLT transformation. This entails removing all <pb/>
    elements and adding <div class="page">...</div> elements to wrap each page. The
    output of this function is valid HTML.
    """
    # TODO [2019-04-26]: Is text_name necessary for anything? It becomes a CSS class
    # that's on the page <div>'s, so we should check the Ticha website's stylesheets
    # to see if it's ever targeted.
    handler = TEIPager(text_name)
    sax.saxify(pseudo_html_root, handler)
    return handler.etree
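
Note: paginate() returns handler.etree, so it can be serialized directly; a hedged usage sketch follows, in which the input file and the text_name value are hypothetical and TEIPager is defined elsewhere in the same project.

# Hedged usage sketch for paginate(); input and text_name are hypothetical.
from lxml import etree

pseudo_html_root = etree.parse('transformed.xml').getroot()   # XSLT output
paged = paginate(pseudo_html_root, 'my-text')
print(etree.tostring(paged, method='html', pretty_print=True).decode('utf-8'))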
Example No. 19
    def test_sax_to_pulldom(self):
        tree = self.parse('<a xmlns="blaA">ab<b:b xmlns:b="blaB">bb</b:b>ba</a>')
        handler = pulldom.SAX2DOM()
        sax.saxify(tree, handler)
        dom = handler.document

        self.assertEqual("a", dom.firstChild.localName)
        self.assertEqual("blaA", dom.firstChild.namespaceURI)

        children = dom.firstChild.childNodes
        self.assertEqual("ab", children[0].nodeValue)
        self.assertEqual("blaB", children[1].namespaceURI)
        self.assertEqual("ba", children[2].nodeValue)
Example No. 20
def md2xhtml(infile, outfile):
    pandoc = subprocess.Popen(['pandoc', '-f', 'markdown', '-t', 'html', '-S'],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    fold = subprocess.Popen(['fold', '-s', '-w', '72'],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    pandoc.stdin.write(infile.read())
    pandoc.stdin.close()
    tree = etree.parse(pandoc.stdout, etree.HTMLParser(encoding='utf-8'))
    sax.saxify(tree, Html2Xhtml(fold.stdin))
    fold.stdin.close()
    outfile.write(xhtml_head)
    outfile.write(fold.stdout.read())
    outfile.write(xhtml_tail)
Example No. 21
 def render(self, context):
     tree = self.nameOfTreeLoc.resolve(context)
     language = self.language.resolve(context)
     sourceCodeOrXML = self.isSourceOrXml.resolve(context)
     grammarFileURL = ""
     if isinstance(self.grammarFile, str):
         grammarFileURL = self.grammarFile
     else:
         grammarFileURL = self.grammarFile.resolve(context)
     contentHandler = SyntaxHighlighter(sourceCodeOrXML == "srcML",
                                        language, grammarFileURL)
     lxmlSAX.saxify(ET.fromstring(tree), contentHandler)
     return contentHandler.content
Example No. 22
    def test_sax_to_pulldom(self):
        tree = self.parse(
            '<a xmlns="blaA">ab<b:b xmlns:b="blaB">bb</b:b>ba</a>')
        handler = pulldom.SAX2DOM()
        sax.saxify(tree, handler)
        dom = handler.document

        self.assertEqual('a', dom.firstChild.localName)
        self.assertEqual('blaA', dom.firstChild.namespaceURI)

        children = dom.firstChild.childNodes
        self.assertEqual('ab', children[0].nodeValue)
        self.assertEqual('blaB', children[1].namespaceURI)
        self.assertEqual('ba', children[2].nodeValue)
Example No. 23
 def run_suite(self, suite_api):
     api_code, api_result = self.rest_api_get(suite_api, prefix="")
     if api_result.xpath("count(//TestSet)=0"):
         sax_handler = SaxBase(api_result)
         print(
             escaped(
                 "ERROR",
                 "[ERROR] Legacy suite '{}' failed to execute".format(
                     suite_api)))
         sax_handler.tests = 1
         sax_handler.errors = 1
     else:
         sax_handler = LegacyApiSax(api_result)
         sax.saxify(api_result, sax_handler)
     return sax_handler
Example No. 24
def markdown_souptest(text, nofollow=False, target=None, lang=None):
    if not text:
        return text

    smd = safemarkdown(text, nofollow, target, lang)

    # Prepend a DTD reference so we can load up definitions of all the standard
    # XHTML entities (&nbsp;, etc.).
    smd_with_dtd = markdown_dtd + smd

    s = StringIO(smd_with_dtd)
    parser = lxml.etree.XMLParser(load_dtd=True)
    tree = lxml.etree.parse(s, parser)
    handler = SouptestSaxHandler(markdown_ok_tags)
    saxify(tree, handler)

    return smd
Example No. 25
    def loadFromFile(cls, filename):
        path_file = os.path.abspath(filename)
        if not os.path.isfile(path_file):
            err = "Error: '%s' does not exist or is not a file." % filename
            print(err)
            raise Exception(err)

        # Note: parsing a file directly with dexml/minidom is supposedly slower, so I used the lxml one,
        #       but I did not benchmark it.
        tree = etree.parse(path_file)
        handler = SAX2DOM()
        sax.saxify(tree, handler)
        dom = handler.document

        # Alternatively, you can pass the filename to parse() here to skip lxml
        mdl = cls.parse(dom)
        return mdl
Example No. 26
def markdown_souptest(text, nofollow=False, target=None):
    if not text:
        return text

    smd = safemarkdown(text, nofollow=nofollow, target=target)

    # Prepend a DTD reference so we can load up definitions of all the standard
    # XHTML entities (&nbsp;, etc.).
    smd_with_dtd = markdown_dtd + smd

    s = StringIO(smd_with_dtd)
    parser = lxml.etree.XMLParser(load_dtd=True)
    tree = lxml.etree.parse(s, parser)
    handler = SouptestSaxHandler(markdown_ok_tags)
    saxify(tree, handler)

    return smd
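
Note: the DTD is prepended because a plain XML parse rejects XHTML named entities such as &nbsp;. A hedged illustration of that failure mode follows; the inline DOCTYPE below is a stand-in for the real markdown_dtd defined elsewhere.

# Hedged illustration: without entity definitions lxml raises an error on
# &nbsp;, while a doctype that defines the entity (inlined here, standing in
# for markdown_dtd) lets the parse succeed.
import lxml.etree
from io import StringIO

snippet = '<p>hello&nbsp;world</p>'

try:
    lxml.etree.parse(StringIO(snippet))
except lxml.etree.XMLSyntaxError as exc:
    print('plain parse failed:', exc)

doctype = '<!DOCTYPE p [<!ENTITY nbsp "&#160;">]>'
parser = lxml.etree.XMLParser(load_dtd=True)
tree = lxml.etree.parse(StringIO(doctype + snippet), parser)
print(lxml.etree.tostring(tree.getroot()))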
Example No. 27
 def query(self, qstr, interval):
     self._descriptor = qstr
     query_opts = {"output":self._output, "from":'0', "qstr":qstr}
     if interval:
         year_cluster = '"{0}"'.format('" OR "'.join(interval))
     else: # Last 3 years
         cur_year = date.today().year
         year_cluster = '"{0}"'.format('" OR "'.join([str(x) for x in range(cur_year -3, cur_year)]))
     query_opts['interval'] = year_cluster
     query_opts.update(self._options)
     template = Template(self._url)
     query = template.substitute(query_opts)
     logger.debug('Querying base: %s\n', self._options['db'])
     response = requests.get(query)
     root = etree.XML(str.encode(response.text, 'UTF-8'))
     handler = DefaultContentHandler()
     saxify(root, handler)
     return handler.articles
Example No. 28
def main():
    ugly = False
    if os.sys.platform[0:3] == 'win':
        ugly = True

    response = urllib2.urlopen(sys.argv[1])
    encoding = response.headers.getparam('charset')
    html = response.read().decode(encoding)

    f = StringIO(html)
    parser = etree.HTMLParser()

    #create SAX tree
    tree = etree.parse(f, parser)

    handler = BoilerpipeHTMLContentHandler()
    sax.saxify(tree, handler)

    a = ArticleExtractor()

    #parses our data and creates TextDocument with TextBlocks
    doc = handler.toTextDocument()

    tw = TextWrapper()
    tw.width = 80
    tw.initial_indent = os.linesep + os.linesep
    parsed_url = urllib2.urlparse.urlparse(sys.argv[1])
    filename = parsed_url.netloc + "-" + "".join([
        c for c in parsed_url.path if c.isalpha() or c.isdigit() or c == ' '
    ]).rstrip() + '.txt'
    output = []
    for line in a.getText(doc).splitlines():
        output.append(tw.fill(line))
    i = 0
    with codecs.open(filename, 'w', encoding='utf8') as f:
        for line in output:
            if ugly:
                line = line.replace('\n', os.linesep)
            f.write(line)
    print "Article saved. Lines: %s. Filename: %s" % (len(output), filename)
Example No. 29
    def parse(cls, xml_string, **parser_kwargs):
        """
        Instantiates an object OOXMLtoLatexParser
         and parse the string given by xml_string

        :param xml_string: An string containing the xml to be
            parsed
        :param parser_kwargs:
            OOXMLtoLatexParser kwargs:
             - math_symbols: list of math symbols
               default to latex_constants.SYMBOLS
        """

        xml_string = OOXMLtoLatexParser.change_xml_double_open_tag_to_left_arrow(xml_string)
        xml_string = OOXMLtoLatexParser._remove_self_closing_tags(xml_string)
        xml_to_latex_parser = cls(**parser_kwargs)

        if isinstance(xml_string, str):
            element = etree.fromstring(xml_string)
            sax.saxify(element, xml_to_latex_parser)
            return xml_to_latex_parser
        else:
            raise TypeError("xml string parameter must be str or unicode")
Example No. 30
    def parse(cls, xml_string, **parser_kwargs):
        """
        Instantiates an object OOXMLtoLatexParser
         and parse the string given by xml_string

        :param xml_string: An string containing the xml to be
            parsed
        :param parser_kwargs:
            OOXMLtoLatexParser kwargs:
             - math_symbols: list of math symbols
               default to latex_constants.SYMBOLS
        """

        xml_string = OOXMLtoLatexParser.change_xml_double_open_tag_to_left_arrow(xml_string)
        xml_string = OOXMLtoLatexParser._remove_self_closing_tags(xml_string)
        xml_to_latex_parser = cls(**parser_kwargs)

        if isinstance(xml_string, basestring):
            element = etree.fromstring(xml_string)
            sax.saxify(element, xml_to_latex_parser)
            return xml_to_latex_parser
        else:
            raise TypeError("xml string parameter must be str or unicode")
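
Note: a hedged usage sketch for OOXMLtoLatexParser.parse(); the OMML string is a made-up fragment, and the attribute that holds the rendered LaTeX is not visible in this snippet, so it is left unnamed.

# Hedged usage sketch; the attribute carrying the LaTeX output is not shown
# in the snippet above, so inspect the returned parser instance to find it.
omml = ('<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math">'
        '<m:r><m:t>x</m:t></m:r></m:oMath>')
parser = OOXMLtoLatexParser.parse(omml)
print(vars(parser))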
Example No. 31
    def write_datapointset (self, datapointset):
        logging.info ('writing %s' % self.type ())
        writer = SAXWriter (self.__file_target__, 2)

        datapointset_x = etree.Element ('datapointset')

        for (source, args) in datapointset.imports.items ():
            xml = etree.SubElement (datapointset_x, 'import')
            xml.set ('source', source)
            xml.set ('args', args)

        for (source, args) in datapointset.builds.items ():
            xml = etree.SubElement (datapointset_x, 'build')
            xml.set ('source', source)
            xml.set ('args', args)

        for criteria in datapointset.iterate_Criterias ():
            criteria_x = etree.SubElement (datapointset_x, 'criteria')

            criteria_x.set ('granularity', criteria.granularity.name)
            criteria_x.set ('wrong_checker_is_fp', str (criteria.wrong_checker_is_fp))
            criteria_x.set ('minimum', str (criteria.minimum))

            for datapoint in criteria.iterate_DataPoints ():
                xml = etree.SubElement (criteria_x, 'datapoint')
                xml.set ('tp', str (datapoint.tp))
                xml.set ('fp', str (datapoint.fp))
                xml.set ('fn', str (datapoint.fn))
                xml.set ('weakness', datapoint.weakness)
                xml.set ('directory', datapoint.directory)
                xml.set ('filename', datapoint.filename)
                xml.set ('function', datapoint.function)
                xml.set ('line', str (datapoint.line))
                xml.set ('permutation', datapoint.permutation)

        sax.saxify (datapointset_x, writer)
Example No. 32
 def run_suite(self, suite_api):
     api_code, api_result = self.rest_api_get(suite_api, prefix="")
     sax_handler = XQSuiteApiSax(api_result)
     sax.saxify(api_result, sax_handler)
     return sax_handler
Example No. 33
        if qname == 'p':
            self.outfile.write('\n\n')
        elif qname in ['em', 'strong']:
            self.outfile.write('}')
        elif qname == 'blockquote':
            self.outfile.write('\\end{quote}\n\n')
        elif qname == 'br':
            self.outfile.write('\\\\\n')
        elif qname == 'h2':
            self.outfile.write('\\mychapter{%s}{%s}{%s}\n\n'
                % (self.curtitle, self.curauthor, self.curdate))

infile = sys.stdin
outfile = sys.stdout


pandoc = subprocess.Popen(['pandoc', '-f', 'markdown', '-t', 'html'],
    stdin=infile, stdout=subprocess.PIPE)
sed = subprocess.Popen(['sed', 's/^[[:space:]]*//'],
    stdin=subprocess.PIPE, stdout=subprocess.PIPE)
coder = utf_8.StreamWriter(sed.stdin)
fold = subprocess.Popen(['fold', '-s', '-w', '72'],
    stdin=sed.stdout, stdout=outfile)
#pandoc.stdin.write(infile.read())
#pandoc.stdin.close()
tree = etree.parse(pandoc.stdout, etree.HTMLParser(encoding='utf-8'))
sax.saxify(tree, Html2Latex(coder))
sed.stdin.close()

#outfile.write(fold.stdout.read())
Example No. 34
 def parse(cls, dom):
     """Converts DOM into paragraphs."""
     handler = cls()
     saxify(dom, handler)
     return handler.content
Example No. 35
 def run(self, startingElement):
     lsax.saxify(startingElement, self)
     result = " ".join(self.text)
     self.text = []
     self.continueReading = True
     return result
Example No. 36
 def parse_file(self, filename):
     parser = lxml.etree.XMLParser(load_dtd=False)
     f = open(filename, 'rb')
     tree = lxml.etree.parse(f, parser)
     handler = SouptestSaxHandler()
     saxify(tree, handler)
Example No. 37
    def getChannels(self):
        return self.channels

    def getPrograms(self):
        return self.programs


# This code takes extremely long time to execute
# Needs to be revised anyway
# One idea is to mark with a special bit channels and programs,
# as an indication that they will be deleted in the future.
f = urllib2.urlopen("https://tvcom.uz/files/xmltv.xml")
tree = etree.parse(f)
f.close()
h = MyContentHandler()
sax.saxify(tree, h)
cursor = connection.cursor(MySQLdb.cursors.DictCursor)
# Lock the tables so that no inconsistencies will occur
# When querying for channels and programs one should perform the following query:
# SELECT * FROM channels WHERE scheduled_for_insertion = FALSE;
# SELECT * FROM program WHERE scheduled_for_insertion = FALSE;
#cursor.execute("LOCK tables channels WRITE, programs WRITE;");
cursor.execute("LOCK tables programs WRITE;")
# Delete tombstones if any...
#cursor.execute("DELETE FROM channels WHERE scheduled_for_insertion = TRUE");
cursor.execute("DELETE FROM programs WHERE scheduled_for_insertion = TRUE")
# Move old ones...
#cursor.execute("UPDATE channels SET scheduled_for_deletion = TRUE");
cursor.execute("UPDATE programs SET scheduled_for_deletion = TRUE")
cursor.execute("UNLOCK tables;")
connection.commit()
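
Note: the comments above describe a tombstone scheme in which rows are flagged rather than dropped outright; below is a hedged sketch of the read-side query they prescribe, reusing the cursor from the snippet and the column names given in those comments.

# Hedged sketch of the read side of the tombstone scheme; reuses the cursor
# and the column names mentioned in the comments above.
cursor.execute(
    "SELECT * FROM programs "
    "WHERE scheduled_for_insertion = FALSE AND scheduled_for_deletion = FALSE"
)
visible_programs = cursor.fetchall()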
Example No. 38
 def parse(cls, dom):
     """Converts DOM into paragraphs."""
     handler = cls()
     saxify(dom, handler)
     return handler.content
Example No. 39
def lxmlparse(f,handler):
	from lxml.etree import parse as lxmlparse
	from lxml.sax import saxify
	etree = lxmlparse(f)
	saxify(etree,handler)
Example No. 40
def normalize_xml(xml, recursively_sort=(), compact=False):
    """Normalizes an XML document.

    The idea is that two semantically equivalent XML documents should be
    normalized into the same canonical representation.  Therefore if two
    documents compare equal after normalization, they are semantically
    equivalent.

    The canonical representation used here has nothing to do with W3C Canonical
    XML.

    This function normalizes indentation, whitespace and newlines (except
    inside text nodes), element attribute order, expands character references,
    expands shorthand notation of empty XML elements ("<br/>" becomes
    "<br></br>").

    If recursively_sort is given, it is a sequence of tags that will have
    test:sort="recursively" automatically appended to their attribute lists in
    the text.  Use it when you cannot or do not want to modify the XML document
    itself.

    If compact is True, nodes that only have text (without newlines) will be
    presented more compactly ("<tag>text</tag>").

    Caveats:
     - normalize_xml does not deal well with text nodes
     - normalize_xml does not help when different prefixes are used for the
       same namespace
     - normalize_xml does not handle all XML features (CDATA sections, inline
       DTDs, processing instructions, comments)
    """

    class Document:

        def __init__(self):
            self.children = []
            self.sort_recursively = False

        def render(self, level=0):
            result = []
            for child in self.children:
                result.append(child.render(level))
            return ''.join(result)

    class Element:

        def __init__(self, parent, tag, attrlist, sort=False,
                     sort_recursively=False):
            self.parent = parent
            self.tag = tag
            self.attrlist = attrlist
            self.children = []
            self.sort = sort
            self.sort_recursively = sort_recursively

        def render(self, level):
            result = []
            indent = '  ' * level
            line = '%s<%s' % (indent, self.tag[1])
            for attr in self.attrlist:
                if len(line + attr) < 78:
                    line += attr
                else:
                    result.append(line)
                    result.append('\n')
                    line = '%s %s%s' % (indent, ' ' * len(self.tag[1]), attr)
            if self.children:
                s = ''.join([child.render(level+1) for child in self.children])
            else:
                s = ''
            if not s:
                result.append('%s/>\n' % line)
            elif (compact and len(self.children) == 1 and '<' not in s
                  and s.count('\n') == 1):
                result.append('%s>%s</%s>\n' % (line, s.strip(), self.tag[1]))
            else:
                result.append('%s>\n' % line)
                result.append(s)
                result.append('%s</%s>\n' % (indent, self.tag[1]))
            return ''.join(result)

        def finalize(self):
            if self.sort:
                self.children.sort(lambda x, y: cmp(x.key, y.key))
            self.key = self.render(0)

    class Text:

        def __init__(self, data):
            self.data = data
            self.key = None

        def render(self, level):
            data = cgi.escape(self.data.strip())
            if data:
                indent = '  ' * level
                return ''.join(['%s%s\n' % (indent, line.strip())
                                for line in data.splitlines()])
            else:
                return ''

    class Handler(ContentHandler):

        def __init__(self):
            self.level = 0
            self.result = []
            self.root = self.cur = Document()
            self.last_text = None
            self._locator = None

        def startElementNS(self, tag, qname, attrs):
            self.startElement(tag, attrs)

        def endElementNS(self, tag, qname):
            self.endElement(tag)

        def startElement(self, tag, attrs):
            sort = sort_recursively = self.cur.sort_recursively
            if attrs:
                if tag in recursively_sort:
                    sort = sort_recursively = True
                attrlist = attrs.items()
                attrlist.sort()
                attrlist = [' %s="%s"' % (k[1], cgi.escape(v, True))
                            for k, v in attrlist]
            else:
                attrlist = []
            child = Element(self.cur, tag, attrlist, sort=sort,
                            sort_recursively=sort_recursively)
            self.cur.children.append(child)
            self.cur = child
            self.last_text = None

        def endElement(self, tag):
            self.cur.finalize()
            self.cur = self.cur.parent
            self.last_text = None

        def characters(self, data):
            if self.last_text is not None:
                self.last_text.data += data
            else:
                self.last_text = Text(data)
                self.cur.children.append(self.last_text)

        def render(self):
            return self.root.render()

    for tag in recursively_sort:
        xml = xml.replace('<%s' % tag,
                          '<%s test:sort="recursively"' % tag)
    handler = Handler()
    tree = etree.XML(xml)
    sax.saxify(tree, handler)
    return ''.join(handler.render())
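
Note: the docstring's claim that semantically equivalent documents share one canonical form can be checked directly; a hedged sketch using two documents that differ only in attribute order, whitespace, and empty-element shorthand.

# Hedged sketch: both documents normalize to the same canonical text.
doc_a = '<root><item b="2" a="1"/></root>'
doc_b = '<root>\n  <item a="1" b="2"></item>\n</root>'
assert normalize_xml(doc_a) == normalize_xml(doc_b)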
Example No. 41
 def process_lxml_tree(self, tree):
     handler = MyContentHandler()
     sax.saxify(tree, handler)
     if handler.max_br > 0:
         print(self.inpname, handler.max_br, "строф")
     return handler.etree