def test_etree_sax_ns_attributes(self):
    # Passing the attribute name "blaA:attr_a1" to the plain (non-NS)
    # startElement() is expected to be rejected with ValueError.
    content_handler = sax.ElementTreeContentHandler()
    content_handler.startDocument()

    prefixed_attrs = {"blaA:attr_a1": "a1"}
    with self.assertRaises(ValueError):
        content_handler.startElement("a", prefixed_attrs)
def TestOneInput(data):
    """Parse ``data`` as XML and replay the resulting tree as SAX events.

    Any ``et.LxmlError`` raised while parsing or while producing the SAX
    events is swallowed; every other exception propagates to the caller.
    """
    try:
        tree = et.parse(io.BytesIO(data))
        sink = sax.ElementTreeContentHandler()
        producer = sax.ElementTreeProducer(tree, sink)
        producer.saxify()
    except et.LxmlError:
        # Deliberately ignored: only non-LxmlError failures are of interest.
        pass
def test_etree_sax_no_ns_attributes(self):
    # Feed <a attr_a1="a1"><b attr_b1="b1"/></a> through the non-namespaced
    # SAX callbacks and check that tags and attributes end up in the tree.
    builder = sax.ElementTreeContentHandler()
    builder.startDocument()
    builder.startElement("a", {"attr_a1": "a1"})
    builder.startElement("b", {"attr_b1": "b1"})
    builder.endElement("b")
    builder.endElement("a")
    builder.endDocument()

    top = builder.etree.getroot()
    self.assertEqual("a", top.tag)
    self.assertEqual("b", top[0].tag)
    self.assertEqual("a1", top.attrib["attr_a1"])
    self.assertEqual("b1", top[0].attrib["attr_b1"])
def startElement(self, tag, attrib):
    """
    Called when an XML element starts.

    Seeing ``constants.EVENT_TAG_EVENT`` creates a fresh
    ``sax.ElementTreeContentHandler`` and marks the event as started.
    While an event is started, the element is forwarded to that handler
    with its attributes re-keyed as ``(None, name)`` tuples.
    """
    if tag == constants.EVENT_TAG_EVENT:
        self.event_handler = sax.ElementTreeContentHandler()
        self.event_handler.startDocument()
        self.event_started = True

    if not self.event_started:
        return

    # ugly, but necessary (incompatibilities between lxml and sax):
    # the attributes are handed over keyed by (namespace, name) with the
    # namespace set to None.
    if attrib.getLength() > 0:
        ns_attrs = {(None, name): attrib[name] for name in attrib.keys()}
    else:
        ns_attrs = {}
    self.event_handler.startElement(tag, ns_attrs)
def test_etree_sax_no_ns(self):
    # Build <a><b/><c/></a> with the non-namespaced SAX callbacks.
    builder = sax.ElementTreeContentHandler()
    builder.startDocument()
    builder.startElement("a", {})
    builder.startElement("b", {})
    builder.endElement("b")
    # The attributes argument is left out entirely for <c>.
    builder.startElement("c")
    builder.endElement("c")
    builder.endElement("a")
    builder.endDocument()

    top = builder.etree.getroot()
    self.assertEqual("a", top.tag)
    for position, expected_tag in enumerate(("b", "c")):
        self.assertEqual(expected_tag, top[position].tag)
def test_etree_sax_redefine_ns(self):
    # The prefix "ns" is mapped to "blaA" for <a>, remapped to "blaB" around
    # <b>, and <c> is emitted after the inner mapping has been ended.
    name_a = ("blaA", "a")
    name_b = ("blaB", "b")
    name_c = ("blaA", "c")

    builder = sax.ElementTreeContentHandler()
    builder.startDocument()

    builder.startPrefixMapping("ns", "blaA")
    builder.startElementNS(name_a, "ns:a", {})

    builder.startPrefixMapping("ns", "blaB")
    builder.startElementNS(name_b, "ns:b", {})
    builder.endElementNS(name_b, "ns:b")
    builder.endPrefixMapping("ns")

    builder.startElementNS(name_c, "ns:c", {})
    builder.endElementNS(name_c, "ns:c")

    builder.endElementNS(name_a, "ns:a")
    builder.endPrefixMapping("ns")
    builder.endDocument()

    top = builder.etree.getroot()
    self.assertEqual("{blaA}a", top.tag)
    for position, expected_tag in enumerate(("{blaB}b", "{blaA}c")):
        self.assertEqual(expected_tag, top[position].tag)
def test_etree_sax_handler_default_ns_None(self):
    # Elements are announced with a None namespace while a prefix-less
    # (None) mapping is active; the assertions expect each element to end
    # up in whichever default namespace was mapped when it started.
    name_a = (None, "a")
    name_b = (None, "b")
    name_c = (None, "c")

    builder = sax.ElementTreeContentHandler()
    builder.startDocument()

    builder.startPrefixMapping(None, "blaA")
    builder.startElementNS(name_a, "a", {})

    builder.startPrefixMapping(None, "blaB")
    builder.startElementNS(name_b, "b", {})
    builder.endElementNS(name_b, "b")
    builder.endPrefixMapping(None)

    builder.startElementNS(name_c, "c", {})
    builder.endElementNS(name_c, "c")

    builder.endElementNS(name_a, "a")
    builder.endPrefixMapping(None)
    builder.endDocument()

    top = builder.etree.getroot()
    self.assertEqual("{blaA}a", top.tag)
    for position, expected_tag in enumerate(("{blaB}b", "{blaA}c")):
        self.assertEqual(expected_tag, top[position].tag)
def test_etree_sax_redefine_ns(self):
    # Replay a SAX stream in which the 'ns' prefix is first bound to 'blaA',
    # temporarily rebound to 'blaB' for <b>, then used again for <c>.
    outer_ns, inner_ns = 'blaA', 'blaB'

    sink = sax.ElementTreeContentHandler()
    sink.startDocument()

    sink.startPrefixMapping('ns', outer_ns)
    sink.startElementNS((outer_ns, 'a'), 'ns:a', {})

    sink.startPrefixMapping('ns', inner_ns)
    sink.startElementNS((inner_ns, 'b'), 'ns:b', {})
    sink.endElementNS((inner_ns, 'b'), 'ns:b')
    sink.endPrefixMapping('ns')

    sink.startElementNS((outer_ns, 'c'), 'ns:c', {})
    sink.endElementNS((outer_ns, 'c'), 'ns:c')

    sink.endElementNS((outer_ns, 'a'), 'ns:a')
    sink.endPrefixMapping('ns')
    sink.endDocument()

    document_root = sink.etree.getroot()
    self.assertEqual('{blaA}a', document_root.tag)
    self.assertEqual('{blaB}b', document_root[0].tag)
    self.assertEqual('{blaA}c', document_root[1].tag)
def test_etree_sax_handler_default_ns_None(self):
    # Every element is reported with namespace None; the prefix-less mapping
    # is 'blaA' for <a> and <c> and is overridden with 'blaB' around <b>.
    sink = sax.ElementTreeContentHandler()
    sink.startDocument()

    sink.startPrefixMapping(None, 'blaA')
    sink.startElementNS((None, 'a'), 'a', {})

    sink.startPrefixMapping(None, 'blaB')
    sink.startElementNS((None, 'b'), 'b', {})
    sink.endElementNS((None, 'b'), 'b')
    sink.endPrefixMapping(None)

    sink.startElementNS((None, 'c'), 'c', {})
    sink.endElementNS((None, 'c'), 'c')

    sink.endElementNS((None, 'a'), 'a')
    sink.endPrefixMapping(None)
    sink.endDocument()

    document_root = sink.etree.getroot()
    self.assertEqual('{blaA}a', document_root.tag)
    self.assertEqual('{blaB}b', document_root[0].tag)
    self.assertEqual('{blaA}c', document_root[1].tag)
def _saxify_unsaxify(self, saxifiable):
    """Replay ``saxifiable`` as SAX events into a new content handler and
    return the tree that handler built (``handler.etree``)."""
    sink = sax.ElementTreeContentHandler()
    producer = sax.ElementTreeProducer(saxifiable, sink)
    producer.saxify()
    return sink.etree
def test_etree_sax_error2(self):
    # With <a> and then <b> open, ending "a" while "b" is still the
    # innermost open element is expected to raise sax.SaxError.
    content_handler = sax.ElementTreeContentHandler()
    content_handler.startDocument()
    content_handler.startElement("a")
    content_handler.startElement("b")

    with self.assertRaises(sax.SaxError):
        content_handler.endElement("a")
def make_pars(self, pars, parent_el, left_strip_text=None, last_page_label=None, include_block_label=False):
    """
        Make each <p class='label'> or <label> element and append it to parent_el.

        For every paragraph a list of deferred SAX calls ``(method, args)`` is
        collected first and executed afterwards against a fresh
        ``sax.ElementTreeContentHandler``; the resulting element is appended to
        ``parent_el`` only if it has text or children.

        Args:
            pars: iterable of paragraph dicts. This function reads the keys
                'id', 'class' and 'block_ids', plus the optional 'redacted'.
            parent_el: element that receives each non-empty paragraph via ``append()``.
            left_strip_text: optional text removed from the front of the text
                tokens, one character at a time, for as long as the characters
                match; the first mismatch disables further stripping.
            last_page_label: page label seen before this call. While it is
                ``None`` no page marker is written for a label change.
            include_block_label: in 'xml' format, copy the first block's 'class'
                (when present and not 'p') into a 'label' attribute on the paragraph.

        Returns:
            The value of ``last_page_label`` after all paragraphs were processed.
    """
    for par in pars:
        # skip paragraphs flagged as redacted when rendering a redacted version
        if self.redacted and par.get('redacted'):
            continue
        handler = sax.ElementTreeContentHandler()
        # queued (method, args) calls; nothing touches the handler until the queue is replayed below
        tag_stack = []
        # names of footnotemark/bracketnum tags opened in this paragraph and not yet closed
        open_tags = set()
        # opening tag
        if self.format == 'xml':
            par_attrs = {'id': par['id']}
            # special handling for duplicative files -- alto block label gets applied as casemets paragraph label attr
            if include_block_label and par['block_ids']:
                first_block = self.blocks_by_id[par['block_ids'][0]]
                if 'class' in first_block and first_block['class'] != 'p':
                    par_attrs['label'] = first_block['class']
            tag_stack.append((handler.startElement, (
                par['class'],
                par_attrs,
            )))
        else:
            # non-xml output: 'p' and 'blockquote' keep their tag name and carry only an id;
            # any other class is mapped through par_class_to_tag (falling back to 'p') and kept as a class attribute
            if par['class'] == 'p':
                tag = (
                    'p',
                    {
                        'id': par['id']
                    },
                )
            elif par['class'] == 'blockquote':
                tag = (
                    'blockquote',
                    {
                        'id': par['id']
                    },
                )
            else:
                tag = (
                    par_class_to_tag.get(par['class'], 'p'),
                    {
                        'class': par['class'],
                        'id': par['id']
                    },
                )
            tag_stack.append((handler.startElement, tag))
        # write each block in the paragraph
        for block_id in par['block_ids']:
            block = self.blocks_by_id[block_id]
            # write <page-number> or <a class='page-label'> between blocks
            if not self.original_xml:
                page_label = self.labels_by_block_id[block_id]
                if page_label != last_page_label:
                    # a marker is only written once a previous label is known
                    if last_page_label is not None:
                        if self.format == 'xml':
                            tag_stack.append((handler.startElement, (
                                'page-number',
                                {
                                    'label': page_label,
                                    'citation-index': '1'
                                },
                            )))
                            tag_stack.append(
                                (handler.characters, ('*' + page_label, )))
                            tag_stack.append(
                                (handler.endElement, ('page-number', )))
                        else:
                            tag_stack.append((handler.startElement, (
                                'a',
                                {
                                    'id': 'p' + page_label,
                                    'href': '#p' + page_label,
                                    'data-label': page_label,
                                    'data-citation-index': '1',
                                    'class': 'page-label'
                                },
                            )))
                            tag_stack.append(
                                (handler.characters, ('*' + page_label, )))
                            tag_stack.append((handler.endElement, ('a', )))
                    last_page_label = page_label
            # write <img>
            if block.get('format') == 'image' and not (
                    self.redacted and block.get('redacted')):
                if self.original_xml:
                    # original_xml output gets a text placeholder instead of an <img> element
                    tag_stack.append(
                        (handler.characters, ('[[Image here]]', )))
                else:
                    tag_stack.append((handler.startElement, (
                        'img',
                        {
                            'src': 'data:' + block['data'],
                            'class': block['class'],
                            'width': str(round(block['rect'][2])),
                            'height': str(round(block['rect'][3]))
                        },
                    )))
                    tag_stack.append((handler.endElement, ('img', )))
            # write tokens
            else:
                # font tags currently open within this block
                open_font_tags = []
                for token in filter_tokens(block, self.html_token_filter, self.redacted):
                    # text token
                    if type(token) == str:
                        if left_strip_text:
                            # drop matching leading characters; the first mismatch turns stripping off for good
                            while left_strip_text and token:
                                if left_strip_text[0] == token[0]:
                                    left_strip_text = left_strip_text[1:]
                                    token = token[1:]
                                else:
                                    left_strip_text = None
                        tag_stack.append((handler.characters, (token, )))
                        continue
                    # non-text tokens are lists: [name] or [name, attrs]; pad with {} so attrs always exists
                    token_name, token_attrs = (token + [{}])[:2]
                    # handle opening and closing font tags
                    if token_name == 'font':
                        if self.original_xml:
                            continue
                        font_obj = self.fonts_by_id[token_attrs['id']]
                        open_font_tags = [
                            tag for tag, font_string in self.font_style_map
                            if font_string in font_obj.style
                        ]
                        self.open_font_tags(handler, tag_stack, open_font_tags)
                    elif token_name == '/font':
                        if self.original_xml:
                            continue
                        self.close_font_tags(handler, tag_stack, open_font_tags)
                        open_font_tags = []
                    # handle footnotemark and bracketnum
                    elif token_name == 'footnotemark' or token_name == 'bracketnum':
                        if self.original_xml:
                            tag_stack.append(
                                (handler.startElement, (token_name, )))
                        elif self.format == 'xml':
                            with self.wrap_font_tags(
                                    handler, tag_stack, open_font_tags):
                                tag_stack.append(
                                    (handler.startElement, (token_name, )))
                        else:
                            # non-xml output renders the mark as <a class=...>, linked when a 'ref' is present
                            attrs = {'class': token_name}
                            ref = token_attrs.get('ref')
                            if ref:
                                attrs['href'] = '#' + ref
                                attrs['id'] = 'ref_' + ref
                            with self.wrap_font_tags(
                                    handler, tag_stack, open_font_tags):
                                tag_stack.append((handler.startElement, (
                                    'a',
                                    attrs,
                                )))
                        open_tags.add(token_name)
                    elif token_name == '/footnotemark' or token_name == '/bracketnum':
                        # we could hit a close tag without an open tag, if the open tag was in a previous redacted block
                        tag_name = token_name[1:]
                        if tag_name in open_tags:
                            with self.wrap_font_tags(
                                    handler, tag_stack, open_font_tags):
                                tag_stack.append(
                                    (handler.endElement, (token_name[1:] if self.format == 'xml' else 'a', )))
                            open_tags.remove(tag_name)
        # run all of our commands, like "handler.startElement(*args)", to actually build the xml tree
        for method, args in tag_stack:
            method(*args)
        # remove empty tags, which would typically be created by redacted spans
        # NOTE(review): _root is a non-public attribute of the content handler
        par_el = handler._root
        remove_empty_tags(par_el, ignore_tags={'img'})
        # append element if not empty (contents not redacted)
        if par_el.text or len(par_el):
            parent_el.append(par_el)
    return last_page_label