def process_node(self, session, data):
    # Turn into SAX and process_eventList() for the time being
    handler = SaxContentHandler()
    sax.saxify(data, handler)
    saxl = handler.currentText
    return self.process_eventList(session, saxl)

def parse_document(file_name):
    tree = etree.parse(file_name)
    handler = ErdmanTransformer()
    saxify(tree, handler)
    handler.save_page()  # get the last page
    titles = get_titles(tree)
    return handler, titles

def fuck_dom(page):
    page = UnicodeDammit(page).unicode_markup
    tree = etree.fromstring(page, etree.HTMLParser())
    # tree.docinfo.encoding = "utf-8"
    handler = SAX2DOM()
    sax.saxify(tree, handler)
    return handler.document

def get_sax(self, session):
    if not self.sax:
        handler = SaxContentHandler()
        sax.saxify(self.dom, handler)
        self.sax = handler.currentText
        self.sax.append("9 %r" % handler.elementHash)
    return self.sax

def test_sax_to_pulldom_multiple_namespaces(self):
    tree = self.parse('<a xmlns="blaA" xmlns:a="blaA"></a>')
    handler = pulldom.SAX2DOM()
    sax.saxify(tree, handler)
    dom = handler.document

    # With multiple prefix definitions, the node should keep the one
    # that was actually used, even if the others also are valid.
    self.assertEqual('a', dom.firstChild.localName)
    self.assertEqual('blaA', dom.firstChild.namespaceURI)
    self.assertEqual(None, dom.firstChild.prefix)

    tree = self.parse('<a:a xmlns="blaA" xmlns:a="blaA"></a:a>')
    handler = pulldom.SAX2DOM()
    sax.saxify(tree, handler)
    dom = handler.document
    self.assertEqual('a', dom.firstChild.localName)
    self.assertEqual('blaA', dom.firstChild.namespaceURI)
    self.assertEqual('a', dom.firstChild.prefix)

def flexify(html_root, flex_path):
    """Insert FLEx annotations after every Zapotec word in the HTML root element."""
    with open(flex_path, 'r', encoding='utf-8') as f:
        flex_dict = json.load(f)
    print('{} words in the FLEx dictionary'.format(len(flex_dict)))
    handler = FLExParser(flex_dict)
    sax.saxify(html_root, handler)
    print('Processed {0.total} word(s), missed {0.missed}'.format(handler))
    return handler.etree

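# A minimal usage sketch for flexify() above. The file names are hypothetical;
# FLExParser and the JSON export layout come from the surrounding project.
from lxml import etree

html_root = etree.parse('text.html', etree.HTMLParser()).getroot()  # hypothetical input
annotated = flexify(html_root, 'flex_export.json')                  # hypothetical path
print(etree.tostring(annotated, pretty_print=True, encoding='unicode'))
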
def process_node(self, session, data):
    """Walk a DOM structure, extract and return.

    Turn into SAX and process_eventList() for the time being.
    """
    handler = SaxContentHandler()
    sax.saxify(data, handler)
    saxl = handler.currentText
    return self.process_eventList(session, saxl)

def write(self, result_set):
    logging.info('writing %s' % self.type())
    if self.__file_target__ is None:
        writer = SAXWriter(result_set.source, 2)
    else:
        writer = SAXWriter(self.__file_target__, 2)
    logging.info('write from source: %s' % result_set.source)
    resultset_w = etree.Element('result')
    resultset_w.set("source", result_set.source)
    resultset_w.set('args', result_set.args)
    from html import escape
    for weakness in result_set.iterate_Weaknesses():
        weaknesselement = etree.SubElement(resultset_w, "weakness")
        weaknesselement.set("id", weakness.name)
        for suite in weakness.iterate_Suites():
            suiteelement = etree.SubElement(weaknesselement, "suite")
            suiteelement.set("dir", suite.directory)
            suiteelement.set("tool", suite.compiler)
            suiteelement.set("args", suite.args)
            for file in suite.iterate_Files():
                for function in file.iterate_Functions():
                    for line in function.iterate_Lines():
                        for flaw in line.iterate_Flaws():
                            flawelement = etree.SubElement(suiteelement, "flaw")
                            attrib = flawelement.attrib
                            attrib['file'] = file.filename
                            attrib['function'] = function.function
                            attrib['line'] = str(line.line)
                            attrib['severity'] = str(flaw.severity.name)
                            attrib['description'] = escape(flaw.description)
                        for bug in line.iterate_Bugs():
                            bugelement = etree.SubElement(suiteelement, "bug")
                            attrib = bugelement.attrib
                            attrib['filename'] = file.filename
                            attrib['function'] = function.function
                            attrib['line'] = str(line.line)
                            attrib['type'] = bug.type
                            attrib['message'] = escape(bug.message)
    sax.saxify(resultset_w, writer)
    if self.__file_target__ is None:
        logging.info("Write successful on file: %s" % (result_set.source))
    else:
        logging.info("Write successful on file: %s" % (self.__file_target__))

def strip_html(doc):
    tree = etree.fromstring(doc)
    handler = TextOnlyContentHandler()
    sax.saxify(tree, handler)
    links_list = ""
    for i, link in enumerate(handler.links):
        links_list += "\n[%d] %s" % (i + 1, link)
    text = " ".join(handler.text)
    return wrap(text, 72) + "\n\n----" + links_list

def __init__(self, content):
    ContentHandler.__init__(self)
    self.content = False
    self.tree = []
    self.stack = []
    self.stack_usage = []
    self.strong = False
    self.emphasis = False
    saxify(content, self)

def markdown_souptest(text, nofollow=False, target=None, lang=None):
    if not text:
        return text

    smd = safemarkdown(text, nofollow, target, lang)

    s = StringIO(smd)
    tree = lxml.etree.parse(s)
    handler = SouptestSaxHandler(markdown_ok_tags)
    saxify(tree, handler)

    return smd

def paginate(pseudo_html_root, text_name):
    """Paginate the output of the XSLT transformation.

    This entails removing all <pb/> elements and adding <div class="page">...</div>
    elements to wrap each page. The output of this function is valid HTML.
    """
    # TODO [2019-04-26]: Is text_name necessary for anything? It becomes a CSS class
    # that's on the page <div>'s, so we should check the Ticha website's stylesheets
    # to see if it's ever targeted.
    handler = TEIPager(text_name)
    sax.saxify(pseudo_html_root, handler)
    return handler.etree

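# A minimal usage sketch for paginate() above. The input file and text name are
# hypothetical; TEIPager comes from the surrounding project.
from lxml import etree

pseudo_html_root = etree.parse('transformed.xml').getroot()  # hypothetical XSLT output
paginated = paginate(pseudo_html_root, 'arte')               # hypothetical text name
print(etree.tostring(paginated, pretty_print=True, encoding='unicode'))
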
def test_sax_to_pulldom(self):
    tree = self.parse('<a xmlns="blaA">ab<b:b xmlns:b="blaB">bb</b:b>ba</a>')
    handler = pulldom.SAX2DOM()
    sax.saxify(tree, handler)
    dom = handler.document

    self.assertEqual("a", dom.firstChild.localName)
    self.assertEqual("blaA", dom.firstChild.namespaceURI)

    children = dom.firstChild.childNodes
    self.assertEqual("ab", children[0].nodeValue)
    self.assertEqual("blaB", children[1].namespaceURI)
    self.assertEqual("ba", children[2].nodeValue)

def md2xhtml(infile, outfile):
    pandoc = subprocess.Popen(['pandoc', '-f', 'markdown', '-t', 'html', '-S'],
                              stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    fold = subprocess.Popen(['fold', '-s', '-w', '72'],
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    pandoc.stdin.write(infile.read())
    pandoc.stdin.close()
    tree = etree.parse(pandoc.stdout, etree.HTMLParser(encoding='utf-8'))
    sax.saxify(tree, Html2Xhtml(fold.stdin))
    fold.stdin.close()
    outfile.write(xhtml_head)
    outfile.write(fold.stdout.read())
    outfile.write(xhtml_tail)

def render(self, context):
    tree = self.nameOfTreeLoc.resolve(context)
    language = self.language.resolve(context)
    sourceCodeOrXML = self.isSourceOrXml.resolve(context)
    if isinstance(self.grammarFile, str):
        grammarFileURL = self.grammarFile
    else:
        grammarFileURL = self.grammarFile.resolve(context)
    contentHandler = SyntaxHighlighter(sourceCodeOrXML == "srcML",
                                       language, grammarFileURL)
    lxmlSAX.saxify(ET.fromstring(tree), contentHandler)
    return contentHandler.content

def run_suite(self, suite_api):
    api_code, api_result = self.rest_api_get(suite_api, prefix="")
    if api_result.xpath("count(//TestSet)=0"):
        sax_handler = SaxBase(api_result)
        print(escaped("ERROR",
                      "[ERROR] Legacy suite '{}' failed to execute".format(suite_api)))
        sax_handler.tests = 1
        sax_handler.errors = 1
    else:
        sax_handler = LegacyApiSax(api_result)
        sax.saxify(api_result, sax_handler)
    return sax_handler

def markdown_souptest(text, nofollow=False, target=None, lang=None):
    if not text:
        return text

    smd = safemarkdown(text, nofollow, target, lang)

    # Prepend a DTD reference so we can load up definitions of all the standard
    # XHTML entities (&nbsp;, etc.).
    smd_with_dtd = markdown_dtd + smd

    s = StringIO(smd_with_dtd)
    parser = lxml.etree.XMLParser(load_dtd=True)
    tree = lxml.etree.parse(s, parser)
    handler = SouptestSaxHandler(markdown_ok_tags)
    saxify(tree, handler)

    return smd

def loadFromFile(cls, filename):
    path_file = os.path.abspath(filename)
    if not os.path.isfile(path_file):
        err = "Error: '%s' does not exist or is not a file." % filename
        print(err)
        raise Exception(err)
    # Note: parsing a file directly with dexml/minidom is supposedly slower, so I
    # used the lxml one, but I did not benchmark it.
    tree = etree.parse(path_file)
    handler = SAX2DOM()
    sax.saxify(tree, handler)
    dom = handler.document
    # Alternatively, you can pass the filename to parse() here to skip lxml.
    mdl = cls.parse(dom)
    return mdl

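# A self-contained sketch of the lxml-to-DOM bridge used in loadFromFile()
# above, assuming SAX2DOM there is xml.dom.pulldom.SAX2DOM (as in the pulldom
# tests elsewhere in this file): lxml.sax.saxify() replays the parsed tree as
# SAX events, and SAX2DOM builds a stdlib DOM document from them.
from xml.dom.pulldom import SAX2DOM
from lxml import etree, sax

tree = etree.fromstring('<model><field name="x"/></model>')
handler = SAX2DOM()
sax.saxify(tree, handler)
dom = handler.document          # a stdlib xml.dom Document
print(dom.firstChild.tagName)   # -> model
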
def markdown_souptest(text, nofollow=False, target=None):
    if not text:
        return text

    smd = safemarkdown(text, nofollow=nofollow, target=target)

    # Prepend a DTD reference so we can load up definitions of all the standard
    # XHTML entities (&nbsp;, etc.).
    smd_with_dtd = markdown_dtd + smd

    s = StringIO(smd_with_dtd)
    parser = lxml.etree.XMLParser(load_dtd=True)
    tree = lxml.etree.parse(s, parser)
    handler = SouptestSaxHandler(markdown_ok_tags)
    saxify(tree, handler)

    return smd

def query(self, qstr, interval):
    self._descriptor = qstr
    query_opts = {"output": self._output, "from": '0', "qstr": qstr}
    if interval:
        year_cluster = '"{0}"'.format('" OR "'.join(interval))
    else:
        # Last 3 years
        cur_year = date.today().year
        year_cluster = '"{0}"'.format('" OR "'.join(
            [str(x) for x in range(cur_year - 3, cur_year)]))
    query_opts['interval'] = year_cluster
    query_opts.update(self._options)
    template = Template(self._url)
    query = template.substitute(query_opts)
    logger.debug('Querying base: %s\n', self._options['db'])
    response = requests.get(query)
    root = etree.XML(str.encode(response.text, 'UTF-8'))
    handler = DefaultContentHandler()
    saxify(root, handler)
    return handler.articles

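# For reference, the year-cluster expression above expands like this
# (assuming the current year is 2024):
years = [str(x) for x in range(2024 - 3, 2024)]
print('"{0}"'.format('" OR "'.join(years)))  # -> "2021" OR "2022" OR "2023"
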
def main():
    ugly = False
    if os.sys.platform[0:3] == 'win':
        ugly = True
    response = urllib2.urlopen(sys.argv[1])
    encoding = response.headers.getparam('charset')
    html = response.read().decode(encoding)
    f = StringIO(html)
    parser = etree.HTMLParser()
    # create SAX tree
    tree = etree.parse(f, parser)
    handler = BoilerpipeHTMLContentHandler()
    sax.saxify(tree, handler)
    a = ArticleExtractor()
    # parses our data and creates a TextDocument with TextBlocks
    doc = handler.toTextDocument()
    tw = TextWrapper()
    tw.width = 80
    tw.initial_indent = os.linesep + os.linesep
    parsed_url = urllib2.urlparse.urlparse(sys.argv[1])
    filename = parsed_url.netloc + "-" + "".join(
        [c for c in parsed_url.path if c.isalpha() or c.isdigit() or c == ' ']
    ).rstrip() + '.txt'
    output = []
    for line in a.getText(doc).splitlines():
        output.append(tw.fill(line))
    with codecs.open(filename, 'w', encoding='utf8') as f:
        for line in output:
            if ugly:
                line = line.replace('\n', os.linesep)
            f.write(line)
    print "Article saved. Lines: %s. Filename: %s" % (len(output), filename)

def parse(cls, xml_string, **parser_kwargs):
    """Instantiate an OOXMLtoLatexParser and parse the string given by xml_string.

    :param xml_string: a string containing the xml to be parsed
    :param parser_kwargs: OOXMLtoLatexParser kwargs:
        - math_symbols: list of math symbols, defaults to latex_constants.SYMBOLS
    """
    xml_string = OOXMLtoLatexParser.change_xml_double_open_tag_to_left_arrow(xml_string)
    xml_string = OOXMLtoLatexParser._remove_self_closing_tags(xml_string)
    xml_to_latex_parser = cls(**parser_kwargs)
    if isinstance(xml_string, str):
        element = etree.fromstring(xml_string)
        sax.saxify(element, xml_to_latex_parser)
        return xml_to_latex_parser
    else:
        raise TypeError("xml string parameter must be str or unicode")

def write_datapointset(self, datapointset):
    logging.info('writing %s' % self.type())
    writer = SAXWriter(self.__file_target__, 2)
    datapointset_x = etree.Element('datapointset')
    for (source, args) in datapointset.imports.items():
        xml = etree.SubElement(datapointset_x, 'import')
        xml.set('source', source)
        xml.set('args', args)
    for (source, args) in datapointset.builds.items():
        xml = etree.SubElement(datapointset_x, 'build')
        xml.set('source', source)
        xml.set('args', args)
    for criteria in datapointset.iterate_Criterias():
        criteria_x = etree.SubElement(datapointset_x, 'criteria')
        criteria_x.set('granularity', criteria.granularity.name)
        criteria_x.set('wrong_checker_is_fp', str(criteria.wrong_checker_is_fp))
        criteria_x.set('minimum', str(criteria.minimum))
        for datapoint in criteria.iterate_DataPoints():
            xml = etree.SubElement(criteria_x, 'datapoint')
            xml.set('tp', str(datapoint.tp))
            xml.set('fp', str(datapoint.fp))
            xml.set('fn', str(datapoint.fn))
            xml.set('weakness', datapoint.weakness)
            xml.set('directory', datapoint.directory)
            xml.set('filename', datapoint.filename)
            xml.set('function', datapoint.function)
            xml.set('line', str(datapoint.line))
            xml.set('permutation', datapoint.permutation)
    sax.saxify(datapointset_x, writer)

def run_suite(self, suite_api):
    api_code, api_result = self.rest_api_get(suite_api, prefix="")
    sax_handler = XQSuiteApiSax(api_result)
    sax.saxify(api_result, sax_handler)
    return sax_handler

if qname == 'p':
    self.outfile.write('\n\n')
elif qname in ['em', 'strong']:
    self.outfile.write('}')
elif qname == 'blockquote':
    self.outfile.write('\\end{quote}\n\n')
elif qname == 'br':
    self.outfile.write('\\\\\n')
elif qname == 'h2':
    self.outfile.write('\\mychapter{%s}{%s}{%s}\n\n'
                       % (self.curtitle, self.curauthor, self.curdate))

infile = sys.stdin
outfile = sys.stdout
pandoc = subprocess.Popen(['pandoc', '-f', 'markdown', '-t', 'html'],
                          stdin=infile, stdout=subprocess.PIPE)
sed = subprocess.Popen(['sed', 's/^[[:space:]]*//'],
                       stdin=subprocess.PIPE, stdout=subprocess.PIPE)
coder = utf_8.StreamWriter(sed.stdin)
fold = subprocess.Popen(['fold', '-s', '-w', '72'],
                        stdin=sed.stdout, stdout=outfile)
# pandoc.stdin.write(infile.read())
# pandoc.stdin.close()
tree = etree.parse(pandoc.stdout, etree.HTMLParser(encoding='utf-8'))
sax.saxify(tree, Html2Latex(coder))
sed.stdin.close()
# outfile.write(fold.stdout.read())

def parse(cls, dom):
    """Converts DOM into paragraphs."""
    handler = cls()
    saxify(dom, handler)
    return handler.content

def run(self, startingElement):
    lsax.saxify(startingElement, self)
    result = " ".join(self.text)
    self.text = []
    self.continueReading = True
    return result

def parse_file(self, filename):
    parser = lxml.etree.XMLParser(load_dtd=False)
    f = open(filename, 'rb')
    tree = lxml.etree.parse(f, parser)
    handler = SouptestSaxHandler()
    saxify(tree, handler)

def getChannels(self):
    return self.channels

def getPrograms(self):
    return self.programs

# This code takes an extremely long time to execute and needs to be revised
# anyway. One idea is to mark channels and programs with a special bit, as an
# indication that they will be deleted in the future.
f = urllib2.urlopen("https://tvcom.uz/files/xmltv.xml")
tree = etree.parse(f)
f.close()
h = MyContentHandler()
sax.saxify(tree, h)
cursor = connection.cursor(MySQLdb.cursors.DictCursor)
# Lock the tables so that no inconsistencies will occur.
# When querying for channels and programs one should perform the following queries:
#   SELECT * FROM channels WHERE scheduled_for_insertion = FALSE;
#   SELECT * FROM programs WHERE scheduled_for_insertion = FALSE;
# cursor.execute("LOCK tables channels WRITE, programs WRITE;")
cursor.execute("LOCK tables programs WRITE;")
# Delete tombstones, if any...
# cursor.execute("DELETE FROM channels WHERE scheduled_for_insertion = TRUE")
cursor.execute("DELETE FROM programs WHERE scheduled_for_insertion = TRUE")
# Move old ones...
# cursor.execute("UPDATE channels SET scheduled_for_deletion = TRUE")
cursor.execute("UPDATE programs SET scheduled_for_deletion = TRUE")
cursor.execute("UNLOCK tables;")
connection.commit()

def lxmlparse(f, handler):
    from lxml.etree import parse as lxmlparse
    from lxml.sax import saxify
    etree = lxmlparse(f)
    saxify(etree, handler)

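# A self-contained sketch of the pattern lxmlparse() above wraps:
# lxml.sax.saxify() walks an already-parsed lxml tree and replays it as SAX
# events (startElementNS, characters, endElementNS, ...) on any object that
# implements the xml.sax ContentHandler interface.
from io import BytesIO
from xml.sax.handler import ContentHandler
from lxml import etree, sax

class TextCollector(ContentHandler):
    """Collect all character data seen during the SAX replay."""
    def __init__(self):
        ContentHandler.__init__(self)
        self.chunks = []

    def characters(self, data):
        self.chunks.append(data)

tree = etree.parse(BytesIO(b'<doc><p>hello</p><p>world</p></doc>'))
handler = TextCollector()
sax.saxify(tree, handler)
print(' '.join(handler.chunks))  # -> hello world
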
def normalize_xml(xml, recursively_sort=(), compact=False):
    """Normalizes an XML document.

    The idea is that two semantically equivalent XML documents should be
    normalized into the same canonical representation. Therefore if two
    documents compare equal after normalization, they are semantically
    equivalent.

    The canonical representation used here has nothing to do with W3C
    Canonical XML.

    This function normalizes indentation, whitespace and newlines (except
    inside text nodes), element attribute order, expands character references,
    and expands shorthand notation of empty XML elements ("<br/>" becomes
    "<br></br>").

    If recursively_sort is given, it is a sequence of tags that will have
    test:sort="recursively" automatically appended to their attribute lists
    in the text. Use it when you cannot or do not want to modify the XML
    document itself.

    If compact is True, nodes that only have text (without newlines) will be
    presented more compactly ("<tag>text</tag>").

    Caveats:
     - normalize_xml does not deal well with text nodes
     - normalize_xml does not help when different prefixes are used for the
       same namespace
     - normalize_xml does not handle all XML features (CDATA sections,
       inline DTDs, processing instructions, comments)
    """

    class Document:
        def __init__(self):
            self.children = []
            self.sort_recursively = False

        def render(self, level=0):
            result = []
            for child in self.children:
                result.append(child.render(level))
            return ''.join(result)

    class Element:
        def __init__(self, parent, tag, attrlist, sort=False,
                     sort_recursively=False):
            self.parent = parent
            self.tag = tag
            self.attrlist = attrlist
            self.children = []
            self.sort = sort
            self.sort_recursively = sort_recursively

        def render(self, level):
            result = []
            indent = ' ' * level
            line = '%s<%s' % (indent, self.tag[1])
            for attr in self.attrlist:
                if len(line + attr) < 78:
                    line += attr
                else:
                    result.append(line)
                    result.append('\n')
                    line = '%s %s%s' % (indent, ' ' * len(self.tag[1]), attr)
            if self.children:
                s = ''.join([child.render(level + 1) for child in self.children])
            else:
                s = ''
            if not s:
                result.append('%s/>\n' % line)
            elif (compact and len(self.children) == 1 and '<' not in s
                  and s.count('\n') == 1):
                result.append('%s>%s</%s>\n' % (line, s.strip(), self.tag[1]))
            else:
                result.append('%s>\n' % line)
                result.append(s)
                result.append('%s</%s>\n' % (indent, self.tag[1]))
            return ''.join(result)

        def finalize(self):
            if self.sort:
                self.children.sort(lambda x, y: cmp(x.key, y.key))
            self.key = self.render(0)

    class Text:
        def __init__(self, data):
            self.data = data
            self.key = None

        def render(self, level):
            data = cgi.escape(self.data.strip())
            if data:
                indent = ' ' * level
                return ''.join(['%s%s\n' % (indent, line.strip())
                                for line in data.splitlines()])
            else:
                return ''

    class Handler(ContentHandler):
        def __init__(self):
            self.level = 0
            self.result = []
            self.root = self.cur = Document()
            self.last_text = None
            self._locator = None

        def startElementNS(self, tag, qname, attrs):
            self.startElement(tag, attrs)

        def endElementNS(self, tag, qname):
            self.endElement(tag)

        def startElement(self, tag, attrs):
            sort = sort_recursively = self.cur.sort_recursively
            if attrs:
                if tag in recursively_sort:
                    sort = sort_recursively = True
                attrlist = attrs.items()
                attrlist.sort()
                attrlist = [' %s="%s"' % (k[1], cgi.escape(v, True))
                            for k, v in attrlist]
            else:
                attrlist = []
            child = Element(self.cur, tag, attrlist, sort=sort,
                            sort_recursively=sort_recursively)
            self.cur.children.append(child)
            self.cur = child
            self.last_text = None

        def endElement(self, tag):
            self.cur.finalize()
            self.cur = self.cur.parent
            self.last_text = None

        def characters(self, data):
            if self.last_text is not None:
                self.last_text.data += data
            else:
                self.last_text = Text(data)
                self.cur.children.append(self.last_text)

        def render(self):
            return self.root.render()

    for tag in recursively_sort:
        xml = xml.replace('<%s' % tag, '<%s test:sort="recursively"' % tag)
    handler = Handler()
    tree = etree.XML(xml)
    sax.saxify(tree, handler)
    return ''.join(handler.render())

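# A usage sketch for normalize_xml() above, based only on its docstring:
# documents that differ in attribute order and empty-element shorthand
# should normalize to the same canonical text.
doc_a = '<root b="2" a="1"><leaf/></root>'
doc_b = '<root a="1" b="2"><leaf></leaf></root>'
assert normalize_xml(doc_a) == normalize_xml(doc_b)
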
def process_lxml_tree(self, tree):
    handler = MyContentHandler()
    sax.saxify(tree, handler)
    if handler.max_br > 0:
        print(self.inpname, handler.max_br, "строф")  # Russian: "stanzas"
    return handler.etree