def test_tostring_result_depends_on_the_param_remove_blank_text_of_load_xml(
        self):
    """Serialized output must differ according to ``remove_blank_text``.

    The same input is loaded twice, once keeping and once dropping the
    whitespace-only text between the two ``italic`` elements, and each
    serialization is compared with its expected literal.
    """
    xml_input = (
        "<root><source>"
        "<italic>texto 1</italic> <italic>texto 2</italic>"
        "</source></root>")

    # Blank text preserved: the space between the siblings survives.
    tree_keeping_blanks, _ = xml_utils.load_xml(
        xml_input, remove_blank_text=False)
    kept = xml_utils.tostring(tree_keeping_blanks)

    # Blank text removed: the siblings end up adjacent.
    tree_dropping_blanks, _ = xml_utils.load_xml(
        xml_input, remove_blank_text=True)
    dropped = xml_utils.tostring(tree_dropping_blanks)

    self.assertNotEqual(kept, dropped)
    self.assertEqual(
        kept,
        "<root><source>"
        "<italic>texto 1</italic> <italic>texto 2</italic>"
        "</source></root>")
    self.assertEqual(
        dropped,
        "<root><source>"
        "<italic>texto 1</italic><italic>texto 2</italic>"
        "</source></root>")
def test_merge_siblings_style_tags_content_does_not_merge_sup(self):
    """Sibling ``sup`` elements stay untouched when only bold/italic are given."""
    document = (
        "<root><source>"
        "<sup>texto 1</sup> <sup>texto 2</sup> "
        "</source></root>")
    unchanged = "<source><sup>texto 1</sup> <sup>texto 2</sup> </source>"
    styles = ('bold', 'italic')

    source = xml_utils.etree.fromstring(document).find(".//source")
    xml_utils.merge_siblings_style_tags_content(source, styles)

    self.assertEqual(xml_utils.tostring(source), unchanged)
def test_remove_styles_off_tagged_content_does_not_remove_italic(self):
    """An ``italic`` that covers only part of the content must be kept."""
    document = (
        "<root><source>"
        "texto 1 <italic>texto italic</italic> texto 2"
        "</source></root>")
    unchanged = (
        "<source>"
        "texto 1 <italic>texto italic</italic> texto 2"
        "</source>")
    styles = ('bold', 'italic')

    source = xml_utils.etree.fromstring(document).find(".//source")
    xml_utils.remove_styles_off_tagged_content(source, styles)

    self.assertEqual(xml_utils.tostring(source), unchanged)
def test_merge_siblings_style_tags_content_does_not_merge_italic_if_there_are_elements_in_the_middle(
        self):
    """Two ``italic`` elements separated by a ``bold`` are not merged."""
    document = (
        "<root><source>"
        "<italic>texto 1</italic> <bold>texto</bold> <italic>texto 2</italic>"
        "</source></root>")
    unchanged = (
        "<source>"
        "<italic>texto 1</italic> <bold>texto</bold> <italic>texto 2</italic>"
        "</source>")
    styles = ('bold', 'italic')

    source = xml_utils.etree.fromstring(document).find(".//source")
    xml_utils.merge_siblings_style_tags_content(source, styles)

    self.assertEqual(xml_utils.tostring(source), unchanged)
def test_remove_styles_off_tagged_content_removes_external_and_keeps_inner(
        self):
    """The outer ``bold`` wrapper is dropped while the nested one remains."""
    document = (
        "<root><source>"
        "<bold>texto 1 <bold>texto bold</bold> texto 2</bold>"
        "</source></root>")
    without_outer_bold = (
        "<source>texto 1 <bold>texto bold</bold> texto 2</source>")
    styles = ('bold', 'italic')

    source = xml_utils.etree.fromstring(document).find(".//source")
    xml_utils.remove_styles_off_tagged_content(source, styles)

    self.assertEqual(xml_utils.tostring(source), without_outer_bold)
def load_articles(filenames):
    """Load every XML in ``filenames`` and serialize its root element.

    Args:
        filenames: mapping of an identifying name to the value handed to
            ``xml_utils.load_xml``.

    Returns:
        A dict mapping each name whose XML was loaded to the string produced
        by ``xml_utils.tostring`` for the tree root. Entries for which
        ``load_xml`` yields no tree are reported on stdout and left out.
    """
    serialized = {}
    for label, source in filenames.items():
        xmltree, errors = xml_utils.load_xml(source)
        if xmltree is None:
            # Loading failed: report it and move on to the next entry.
            print(' ERROR 1: {} - {}'.format(label, errors))
            continue
        serialized[label] = xml_utils.tostring(xmltree.getroot())
    return serialized
def write_etree_to_file(tree: etree.ElementTree, path: str) -> None:
    """Write an lxml tree to a destination file.

    The tree is serialized with ``xml_utils.tostring(tree)``, which is also
    what ensures that entities are not modified. Nothing is written when
    either ``tree`` or ``path`` is None.
    """
    has_both_arguments = tree is not None and path is not None
    if has_both_arguments:
        serialized = xml_utils.tostring(tree)
        fs_utils.write_file(path, serialized)
    return None
def test_load_xml_with_remove_blank_text_as_false_keep_blanks(self):
    """With ``remove_blank_text=False`` the input round-trips unchanged."""
    xml_input = (
        "<root><source>"
        "<italic>texto 1</italic> <italic>texto 2</italic>"
        "</source></root>")

    parsed, _ = xml_utils.load_xml(xml_input, remove_blank_text=False)
    serialized = xml_utils.tostring(parsed)

    self.assertEqual(xml_input, serialized)
def test_load_xml_with_remove_blank_text_as_true_remove_blanks(self):
    """With ``remove_blank_text=True`` the blank between siblings is dropped."""
    xml_input = (
        "<root><source>"
        "<italic>texto 1</italic> <italic>texto 2</italic>"
        "</source></root>")
    expected = (
        "<root><source>"
        "<italic>texto 1</italic><italic>texto 2</italic>"
        "</source></root>")

    parsed, _ = xml_utils.load_xml(xml_input, remove_blank_text=True)
    serialized = xml_utils.tostring(parsed)

    self.assertEqual(expected, serialized)
def insert_ext_link_elements_in_mixed_citation(self):
    """
    Insert ext-link elements into the mixed-citation text.

    If the mixed-citation text contains links that are not marked up as
    ext-link, wrap them using the ext-link elements that already exist in
    element-citation.

    Nothing is done when a mixed-citation already contains an ext-link,
    when there is no mixed-citation, or when element-citation has no
    ext-link. Only the first mixed-citation found in the tree is rewritten.
    """
    if self.tree.findall(".//mixed-citation//ext-link"):
        return

    mixed_citation = self.tree.find(".//mixed-citation")
    if mixed_citation is None:
        return

    links = self.tree.findall(".//element-citation//ext-link")
    if not links:
        return

    mixed_citation_text = xml_utils.tostring(mixed_citation)
    handled_texts = set()
    for link in links:
        link_text = link.text
        # An ext-link without usable text cannot be located in the citation:
        # None would make str.replace raise TypeError, and blank text would
        # inject the markup at every matching whitespace run.
        if link_text is None or not link_text.strip():
            continue
        # Replacing the same text twice would rewrite the markup inserted by
        # the first pass (including its href attribute) and corrupt the XML.
        if link_text in handled_texts:
            continue
        handled_texts.add(link_text)
        mixed_citation_text = mixed_citation_text.replace(
            link_text, xml_utils.tostring(link))

    new_mixed_citation = xml_utils.etree.fromstring(mixed_citation_text)
    parent = mixed_citation.getparent()
    parent.replace(mixed_citation, new_mixed_citation)
def test_strip_all_tags_except_removes_all_a_except_a_with_href(self):
    """Only ``a`` elements carrying ``href`` survive; the others are unwrapped."""
    document = """<root><p> <a>Texto 1</a> <a href="x">Ciência</a> <a href="y">Arte</a> <a>Texto 2</a> </p></root>"""
    expected = """<p> Texto 1 <a href="x">Ciência</a> <a href="y">Arte</a> Texto 2 </p>"""
    keep_xpaths = [".//a[@href]"]

    paragraph = xml_utils.etree.fromstring(document).find(".//p")
    xml_utils.strip_all_tags_except(paragraph, keep_xpaths)
    serialized = xml_utils.tostring(paragraph)

    self.assertEqual(expected, serialized)
def _get_filenames(self, tree):
    """Collect the asset file names referenced by ``tree`` and fix its hrefs.

    Walks the nodes returned by ``article.nodes_which_have_xlink_href(tree)``
    and, modifying ``tree`` in place:

    * retags nodes whose ``specific-use`` is ``"scielo-web"`` as ``REMOVE``
      so they are later stripped with ``etree.strip_tags``;
    * replaces an href with the value registered for its base name in
      ``self.scielo_pkg_files.tiff_name_and_basename_items``, when there is
      one and it differs;
    * drops a trailing ``-en`` from the href base name.

    Returns:
        A tuple ``(files, changed)`` where ``files`` is a list of
        ``(href, new_href)`` pairs (both equal when no ``-en`` suffix was
        dropped) and ``changed`` is true when any node was marked for
        removal or had its href rewritten.

    NOTE(review): the block nesting below was reconstructed from a flattened
    source; confirm that the ``alternatives`` clean-up belongs inside
    ``if delete:``.
    """
    files = []
    delete = False
    rename = False
    tiff_items = self.scielo_pkg_files.tiff_name_and_basename_items
    for node in article.nodes_which_have_xlink_href(tree):
        if node.get("specific-use") == "scielo-web":
            # Mark now, strip after the loop.
            node.tag = "REMOVE"
            delete = True
            continue
        href = node.attrib['{http://www.w3.org/1999/xlink}href']
        name, ext = os.path.splitext(href)
        # replace the href value with the digital asset found among the tiffs
        tiff = tiff_items.get(name)
        if tiff and href != tiff:
            rename = True
            node.set("{http://www.w3.org/1999/xlink}href", tiff)
            href = tiff
        # remove the -en suffix from the digital assets of the English version
        name, ext = os.path.splitext(href)
        if name.endswith("-en"):
            new = name[:-3] + ext
            files.append((href, new))
            rename = True
            node.set("{http://www.w3.org/1999/xlink}href", new)
        else:
            files.append((href, href))
    if delete:
        xml_utils.etree.strip_tags(tree, "REMOVE")
        # An <alternatives> left with a single child is marked as well and
        # stripped by the second strip_tags call.
        for node in tree.findall(".//alternatives"):
            if len(node.getchildren()) == 1:
                logger.info("Remove alternatives: {}".format(
                    xml_utils.tostring(node)))
                node.tag = "REMOVE"
        xml_utils.etree.strip_tags(tree, "REMOVE")
    return files, delete or rename
def __init__(self, tree):
    """Keep ``tree`` and its serialization from ``xml_utils.tostring``."""
    self.tree = tree
    serialized = xml_utils.tostring(self.tree)
    self.content = serialized