def test_init_xml_is_ok(self):
    """A minimal well-formed document is expected to load as-is, error free."""
    source = "<doc/>"
    loaded = xml_utils.SuitableXML(source)

    # Content is expected back unchanged, with no parsing error recorded.
    self.assertEqual(loaded.content, "<doc/>")
    self.assertIsNone(loaded.xml_error)

    # The input carries neither a DOCTYPE nor an XML declaration.
    self.assertEqual(loaded.doctype, '')
    self.assertIsNone(loaded.xml_declaration)
def test_write_should_write_corrected_xml_in_dest_file(self):
    """Load an XML file, write it to a new path and inspect the written file.

    The written file is parsed again and its declaration, DOCTYPE and
    serialized root element are compared with the expected values.
    """
    # The same DOCTYPE is used for the input and for the expected docinfo.
    doctype = (
        '<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal '
        'Publishing DTD v1.1 20151215//EN" '
        '"https://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">'
    )
    text = (
        '<?xml version="1.0" encoding="utf-8"?>'
        + doctype
        + '<article><p>&lt;</p></article>'
    )
    with tempfile.TemporaryDirectory() as tmp_dir:
        source_path = os.path.join(tmp_dir, "in_xml_doc.xml")
        with open(source_path, 'w') as source_file:
            source_file.write(text)
        target_path = os.path.join(tmp_dir, "out_xml_doc.xml")

        suitable_xml = xml_utils.SuitableXML(source_path)
        suitable_xml.write(target_path)

        # Re-read the written file to check what actually landed on disk.
        written = xml_utils.etree.parse(target_path)
        docinfo = written.docinfo
        self.assertIsNotNone(docinfo)
        self.assertEqual(written.docinfo.xml_version, "1.0")
        self.assertEqual(written.docinfo.encoding, "UTF-8")
        self.assertEqual(written.docinfo.doctype, doctype)
        self.assertEqual(
            xml_utils.etree.tostring(written.getroot()),
            b'<article>\n <p><</p>\n</article>')
        self.assertIsNone(suitable_xml.xml_error)
def make_xml(self, scielo_dtd_files, pmc_dtd_files):
    """Generate the PMC XML file, copy its referenced files and validate it.

    Steps, in order:
      1. transform ``self.article.tree`` with ``scielo_dtd_files.xsl_output``
         and write the result to ``self.pmc_xml_filepath``;
      2. reload that file, collect file names and insert math ids, and
         rewrite the file when either step reports a change;
      3. copy each listed file from the SciELO package directory to the
         directory of the PMC XML file under its new name;
      4. transform again with ``pmc_dtd_files.xsl_output`` and write the
         result to the same path;
      5. run ``PMCXMLValidator`` on the written file.

    Args:
        scielo_dtd_files: object providing ``xsl_output`` for the first
            transformation.
        pmc_dtd_files: object providing ``xsl_output`` for the second
            transformation; also handed to the validator.
    """
    # j1.1/xsl/sgml2xml/xml2pmc.xsl
    first_pass = xml_utils.transform(
        self.article.tree, scielo_dtd_files.xsl_output)
    xml_utils.write(self.pmc_xml_filepath, first_pass)

    # reload the file that has just been written
    reloaded = xml_utils.SuitableXML(self.pmc_xml_filepath)
    filenames, changed = self._get_filenames(reloaded.xml)
    numbers = self._insert_math_id(reloaded.xml)
    if numbers or changed:
        reloaded.write(self.pmc_xml_filepath)

    target_dir = os.path.dirname(self.pmc_xml_filepath)
    for old_name, new_name in filenames:
        source_file = os.path.join(self.scielo_pkg_files.path, old_name)
        if not os.path.isfile(source_file):
            # A missing file is only reported; the build carries on.
            logging.info(
                "File not found %s to compose PMC Package %s",
                source_file, self.pmc_xml_filepath)
            continue
        shutil.copyfile(source_file, os.path.join(target_dir, new_name))

    # j1.1/xsl/sgml2xml/pmc.xsl
    second_pass = xml_utils.transform(
        reloaded.xml, pmc_dtd_files.xsl_output)
    xml_utils.write(self.pmc_xml_filepath, second_pass)

    # validate
    validator = sps_xml_validators.PMCXMLValidator(pmc_dtd_files)
    validator.validate(
        self.pmc_xml_filepath,
        self.outputs.pmc_dtd_report_filename,
        self.outputs.pmc_style_report_filename)
def test_write_should_write_original_content_if_input_is_not_xml(self):
    """Non-XML input is expected to be written back verbatim, with an error set."""
    text = "Qualquer texto nao XML."
    not_xml = xml_utils.SuitableXML(text)

    with tempfile.TemporaryDirectory() as tmp_dir:
        target_path = os.path.join(tmp_dir, "xml_doc.xml")
        not_xml.write(target_path)
        with open(target_path) as written_file:
            written = written_file.read()
        self.assertEqual(written, text)

    # The failure must be reported through xml_error.
    self.assertIsNotNone(not_xml.xml_error)
    self.assertIn(
        "it must be an XML content or XML file path",
        not_xml.xml_error)
def __init__(self, src_pkgfiles, acron, dest_path):
    """Keep the given inputs, load the package XML and reset bookkeeping.

    Args:
        src_pkgfiles: source package files object; its ``filename`` is
            passed to ``xml_utils.SuitableXML`` and the result is kept in
            ``self.xml``.
        acron: stored unchanged in ``self.acron``.
        dest_path: stored unchanged in ``self.dest_path``.
    """
    # Caller-provided inputs.
    self.src_pkgfiles = src_pkgfiles
    self.acron = acron
    self.dest_path = dest_path

    # Document loaded from the package's file name.
    self.xml = xml_utils.SuitableXML(src_pkgfiles.filename)
    # NOTE(review): new_name is bound to the loaded document's ``xml``
    # attribute, not to a name string -- confirm this is intended.
    self.new_name = self.xml.xml

    # Bookkeeping lists, each starting as its own empty list.
    self.related_files_copy = []
    self.href_replacements = []
    self.href_files_copy = []
    self.href_names = []
    self.missing_href_files = []
def test_well_formed_xml_content_removes_extra_spaces(self):
    """Tabs, newlines and repeated blanks are expected to collapse to one space."""
    text = """<doc><p><title>is nunc. Scelerisque in dictum non consectetur a erat nam. Ipsum dolor sit amet consectetur\t adipiscing elit duis tristique sollicitudin. \n Eu scelerisque felis imperdiet proin fermen</title></p></doc>"""
    document = xml_utils.SuitableXML(text)
    document.well_formed_xml_content()

    expected = (
        '<doc><p><title>is nunc. Scelerisque in dictum non '
        'consectetur a erat nam. Ipsum dolor sit amet consectetur '
        'adipiscing elit duis tristique sollicitudin. Eu scelerisque '
        'felis imperdiet proin fermen</title></p></doc>')
    self.assertEqual(expected, document.content)
def _insert_xhtml_tables_in_document(self):
    """Replace ``<xhtml href="...">`` elements with the table they point to.

    For every ``xhtml`` element found in ``self.xml``, the ``href`` is
    resolved against ``self.src_pkgfiles.path`` and loaded with
    ``xml_utils.SuitableXML``. When the loaded document contains a
    ``table`` element, a deep copy of the first one replaces the ``xhtml``
    element in its parent.

    Elements are left untouched when ``href`` is missing or empty, when the
    referenced file does not exist, when the file could not be loaded, or
    when it contains no ``table``.
    """
    for xhtml in self.xml.findall(".//xhtml"):
        href = xhtml.get("href")
        if not href:
            continue

        table_file_path = os.path.join(self.src_pkgfiles.path, href)
        if not os.path.isfile(table_file_path):
            continue

        table_tree = xml_utils.SuitableXML(table_file_path).xml
        # Compare with None explicitly: truth-testing an lxml element
        # reports whether it has children (and is deprecated), so it is not
        # a reliable "was the document loaded?" check.
        if table_tree is None:
            continue

        table = table_tree.find(".//table")
        if table is not None:
            # deepcopy keeps the table's source tree intact.
            xhtml.getparent().replace(xhtml, deepcopy(table))
def test_write_should_write_original_content_if_input_is_invalid_xml(self):
    """Malformed XML is expected to be written back verbatim, with an error set."""
    text = (
        '<?xml version="1.0" encoding="utf-8"?>'
        '<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal '
        'Publishing DTD v1.1 20151215//EN" '
        '"https://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">'
        '\n<article>'
        '<p><ext-link ext-link-type="uri" xlink:href="<link-invalido>">bla</ext-link></p>'
        '</article>')
    broken = xml_utils.SuitableXML(text)

    with tempfile.TemporaryDirectory() as tmp_dir:
        target_path = os.path.join(tmp_dir, "xml_doc.xml")
        broken.write(target_path)
        with open(target_path) as written_file:
            written = written_file.read()
        self.assertEqual(written, text)

    # The parser failure must be reported through xml_error.
    self.assertIsNotNone(broken.xml_error)
    self.assertIn("Loading XML from 'str': ", broken.xml_error)
def test_init_xml_with_no_xml_declaration(self):
    """Input with a DOCTYPE but no XML declaration, plus trailing junk.

    Expected: no declaration reported, the DOCTYPE reported as given, and
    the content ending right after the root element.
    """
    doctype = (
        '<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal '
        'Publishing DTD v1.1 20151215//EN" "https://jats.nlm.nih.gov/'
        'publishing/1.1/JATS-journalpublishing1.dtd">')
    root = '\n<article/>'

    # ' lixo' is the trailing junk placed after the root element.
    document = xml_utils.SuitableXML(doctype + root + ' lixo')

    self.assertIsNone(document.xml_declaration)
    self.assertEqual(document.doctype, doctype)
    self.assertEqual(document.content, doctype + root)
def test_well_formed_xml_content_converts_quot_ent_to_chars(self):
    """Check the content produced for a document with a quoted attribute."""
    text = '<doc><p><a href="bla">teste</a></p></doc>'
    document = xml_utils.SuitableXML(text)
    document.well_formed_xml_content()

    expected = '<doc><p><a href="bla">teste</a></p></doc>'
    self.assertEqual(expected, document.content)
def test_well_formed_xml_content_converts_entities_to_chars(self):
    """Check the content produced for a document holding special characters."""
    text = '<doc><p>[ç]</p></doc>'
    document = xml_utils.SuitableXML(text)
    document.well_formed_xml_content()

    expected = '<doc><p>[ç]</p></doc>'
    self.assertEqual(expected, document.content)
def test_well_formed_xml_content_removes_junk_after_last_close_tag(self):
    """Text after the closing root tag is expected to be dropped."""
    # ' lixo' is the junk that follows the closing root tag.
    text = '<doc><p></p></doc> lixo'
    document = xml_utils.SuitableXML(text)
    document.well_formed_xml_content()

    # The empty <p></p> pair is expected in its self-closed form.
    expected = '<doc><p/></doc>'
    self.assertEqual(expected, document.content)
def test_content_returns_characteres_instead_their_entities(self):
    """``content`` is compared right after construction, with no extra call."""
    text = '<doc><p>[ç]</p> lixo</doc>'
    document = xml_utils.SuitableXML(text)

    expected = '<doc><p>[ç]</p> lixo</doc>'
    self.assertEqual(expected, document.content)
def test_init_xml_with_no_doctype(self):
    """Input with an XML declaration but no DOCTYPE.

    Expected: the declaration is reported as given and the doctype is an
    empty string.
    """
    declaration = '<?xml version="1.0" encoding="utf-8"?>'
    document = xml_utils.SuitableXML(declaration + '<doc/>')

    self.assertEqual(document.xml_declaration, declaration)
    self.assertEqual(document.doctype, '')
def test_init_xml_with_junk_is_loaded_without_errors(self):
    """Junk after the root element is expected to be dropped without error."""
    # ' lixo' is the junk that follows the root element.
    with_junk = "<doc/> lixo"
    document = xml_utils.SuitableXML(with_junk)

    self.assertEqual(document.content, "<doc/>")
    self.assertIsNone(document.xml_error)