def test_input_output(self, filename):
    """Round-trip a sample XML file and check the doctype is retained in the output."""
    root, doctype_dict = xmlio.parse(sample_xml(filename), return_doctype_dict=True)
    with open(sample_xml(filename), "rb") as xml_file:
        expected_bytes = xml_file.read()
    # outputting with the parsed doctype must reproduce the original bytes
    self.assertEqual(xmlio.output(root, None, doctype_dict), expected_bytes)
def convert_xml(self, doi_id, xml_file, filenames, new_filenames):
    """Rewrite a PoA article XML file in place.

    For PoA articles: capitalise subject groups, ensure a pub-date and
    volume are present, reset the article-id, add a self-uri for a PDF
    when none exists, and reference any ds.zip supplementary file.
    The file at xml_file is overwritten with the converted XML.
    """
    # Register namespaces
    xmlio.register_xmlns()
    root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)
    soup = self.article_soup(xml_file)
    if parser.is_poa(soup):
        # Capitalise subject group values in article categories
        root = self.subject_group_convert_in_xml(root)
        if parser.pub_date(soup) is None:
            # add the published date to the XML
            pub_date = self.get_pub_date_if_missing(doi_id)
            root = self.add_pub_date_to_xml(doi_id, pub_date, root)
        else:
            pub_date = parser.pub_date(soup)
        if parser.volume(soup) is None:
            # pub_date[0] is the year; volume numbers are offset from 2011
            # (the journal's launch year) — TODO confirm against xmlio/parser docs
            volume = pub_date[0] - 2011
            self.add_volume_to_xml(doi_id, volume, root)
        # set the article-id, to overwrite the v2, v3 value if present
        root = self.set_article_id_xml(doi_id, root)
        # if there is no self-uri yet and a pdf file is present, add a self-uri tag
        if parser.self_uri(soup) is not None and len(parser.self_uri(soup)) == 0:
            for filename in new_filenames:
                if filename.endswith('.pdf'):
                    root = self.add_self_uri_to_xml(doi_id, filename, root)
        # if ds.zip file is there, then add it to the xml
        poa_ds_zip_file = None
        for new_name in new_filenames:
            if new_name.endswith('.zip'):
                poa_ds_zip_file = new_name
        if poa_ds_zip_file:
            root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file, root)
    # Start the file output
    reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)
    # Remove extra whitespace here for PoA articles to clean up and one VoR file too
    reparsed_string = reparsed_string.replace("\n", '').replace("\t", '')
    # with-block guarantees the file handle is closed even if the write raises
    with open(xml_file, 'wb') as out_file:
        out_file.write(reparsed_string)
def test_convert_xlink_href(self, name_map, xml_input_filename, xml_expected_filename):
    """Converting xlink:href values via name_map should yield the expected XML file."""
    xmlio.register_xmlns()
    root = xmlio.parse(sample_xml(xml_input_filename))
    # converts in place; the returned count of conversions is not needed here
    xmlio.convert_xlink_href(root, name_map)
    xml_output = xmlio.output(root)
    with open(sample_xml(xml_expected_filename), "rb") as xml_file:
        xml_output_expected = xml_file.read()
    self.assertEqual(xml_output, xml_output_expected)
def convert_xml(self, doi_id, xml_file, filenames, new_filenames):
    """Rewrite a PoA article XML file in place.

    For PoA articles: capitalise subject groups, ensure a pub-date and
    volume are present, reset the article-id, add a self-uri for a PDF
    when none exists, and reference any ds.zip supplementary file.
    The file at xml_file is overwritten with the converted XML.
    """
    # Register namespaces
    xmlio.register_xmlns()
    root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)
    soup = self.article_soup(xml_file)
    if parser.is_poa(soup):
        # Capitalise subject group values in article categories
        root = self.subject_group_convert_in_xml(root)
        if parser.pub_date(soup) is None:
            # add the published date to the XML
            pub_date = self.get_pub_date_if_missing(doi_id)
            root = self.add_pub_date_to_xml(doi_id, pub_date, root)
        else:
            pub_date = parser.pub_date(soup)
        if parser.volume(soup) is None:
            # pub_date[0] is the year; volume numbers are offset from 2011
            # (the journal's launch year) — TODO confirm against xmlio/parser docs
            volume = pub_date[0] - 2011
            self.add_volume_to_xml(doi_id, volume, root)
        # set the article-id, to overwrite the v2, v3 value if present
        root = self.set_article_id_xml(doi_id, root)
        # if there is no self-uri yet and a pdf file is present, add a self-uri tag
        if parser.self_uri(soup) is not None and len(parser.self_uri(soup)) == 0:
            for filename in new_filenames:
                if filename.endswith('.pdf'):
                    root = self.add_self_uri_to_xml(doi_id, filename, root)
        # if ds.zip file is there, then add it to the xml
        poa_ds_zip_file = None
        for new_name in new_filenames:
            if new_name.endswith('.zip'):
                poa_ds_zip_file = new_name
        if poa_ds_zip_file:
            root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file, root)
    # Start the file output
    reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)
    # Remove extra whitespace here for PoA articles to clean up and one VoR file too
    reparsed_string = reparsed_string.replace("\n", '').replace("\t", '')
    # with-block guarantees the file handle is closed even if the write raises
    with open(xml_file, 'wb') as out_file:
        out_file.write(reparsed_string)
def rewrite_xml_file(self, xml_filename, file_name_map):
    """Rewrite xlink:href values in the tmp-dir copy of xml_filename using file_name_map.

    The file is overwritten in place inside the activity tmp directory.
    """
    local_xml_filename = path.join(self.get_tmp_dir(), xml_filename)
    xmlio.register_xmlns()
    root = xmlio.parse(local_xml_filename)
    # Convert xlink href values in place; the conversion count is unused
    xmlio.convert_xlink_href(root, file_name_map)
    # Start the file output
    reparsed_string = xmlio.output(root)
    # with-block guarantees the file handle is closed even if the write raises
    with open(local_xml_filename, 'wb') as xml_out:
        xml_out.write(reparsed_string)
def convert_xml(xml_file, file_name_map):
    """Rewrite xlink:href values in xml_file using file_name_map, preserving the doctype.

    The file at xml_file is overwritten in place.
    """
    # Register namespaces
    xmlio.register_xmlns()
    root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)
    # Convert xlink href values in place; the conversion count is unused
    xmlio.convert_xlink_href(root, file_name_map)
    # TODO - compare whether all file names were converted
    # Start the file output
    reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)
    # with-block guarantees the file handle is closed even if the write raises
    with open(xml_file, 'wb') as xml_out:
        xml_out.write(reparsed_string)
def convert_xml(self, xml_file, file_name_map):
    """Rewrite xlink:href values in xml_file using file_name_map, preserving the doctype.

    The file at xml_file is overwritten in place.
    """
    # Register namespaces
    xmlio.register_xmlns()
    root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)
    # Convert xlink href values in place; the conversion count is unused
    xmlio.convert_xlink_href(root, file_name_map)
    # TODO - compare whether all file names were converted
    # Start the file output
    reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)
    # with-block guarantees the file handle is closed even if the write raises
    with open(xml_file, 'wb') as xml_out:
        xml_out.write(reparsed_string)
def rewrite_xml_file(self, xml_filename, file_name_map):
    """Rewrite xlink:href values in the tmp-dir copy of xml_filename, keeping its doctype.

    The file is overwritten in place inside the activity tmp directory.
    """
    local_xml_filename = path.join(self.get_tmp_dir(), xml_filename)
    xmlio.register_xmlns()
    root, doctype_dict = xmlio.parse(local_xml_filename, return_doctype_dict=True)
    # Convert xlink href values in place; the conversion count is unused
    xmlio.convert_xlink_href(root, file_name_map)
    # Start the file output
    reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)
    # with-block guarantees the file handle is closed even if the write raises
    with open(local_xml_filename, 'wb') as xml_out:
        xml_out.write(reparsed_string)
def convert_xml(self, doi_id, xml_file, filenames, new_filenames):
    """Rewrite a PoA article XML file in place.

    For PoA articles: capitalise subject groups, add a pub-date when
    missing, reset the article-id, and reference any ds.zip
    supplementary file. The file at xml_file is overwritten.
    """
    # Register namespaces
    xmlio.register_xmlns()
    root = xmlio.parse(xml_file)
    soup = self.article_soup(xml_file)
    if parser.is_poa(soup):
        # Capitalise subject group values in article categories
        root = self.subject_group_convert_in_xml(root)
        if parser.pub_date(soup) is None:
            # add the published date to the XML
            root = self.add_pub_date_to_xml(doi_id, root)
        # set the article-id, to overwrite the v2, v3 value if present
        root = self.set_article_id_xml(doi_id, root)
        # if ds.zip file is there, then add it to the xml
        poa_ds_zip_file = None
        for new_name in new_filenames:
            if new_name.endswith('.zip'):
                poa_ds_zip_file = new_name
        if poa_ds_zip_file:
            root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file, root)
    # Start the file output
    reparsed_string = xmlio.output(root)
    # Remove extra whitespace here for PoA articles to clean up and one VoR file too
    reparsed_string = reparsed_string.replace("\n", '').replace("\t", '')
    # with-block guarantees the file handle is closed even if the write raises
    with open(xml_file, 'wb') as out_file:
        out_file.write(reparsed_string)
def test_output(self, xml, type, xml_expected):
    """Outputting a parsed XML string with the given type should match the expected string."""
    parsed_root = xmlio.parse(StringIO.StringIO(xml))
    self.assertEqual(xmlio.output(parsed_root, type), xml_expected)
def test_input_output_forcing_jats_doctype(self, filename):
    """Forcing the 'JATS' doctype on output should reproduce the sample file exactly."""
    root, doctype_dict = xmlio.parse(sample_xml(filename), return_doctype_dict=True)
    with open(sample_xml(filename), "rb") as xml_file:
        expected_bytes = xml_file.read()
    self.assertEqual(xmlio.output(root, 'JATS'), expected_bytes)
def test_output_processing_instructions(self, xml, doc_type, xml_expected):
    """Round-tripping XML with processing instructions should match the expected output."""
    parse_result = xmlio.parse(BytesIO(xml), True, True)
    root, doctype_dict, processing_instructions = parse_result
    output_bytes = xmlio.output(
        root, doc_type, doctype_dict, processing_instructions)
    self.assertEqual(output_bytes.decode("utf-8"), xml_expected)
def test_output(self, xml, doc_type, xml_expected):
    """Outputting parsed XML bytes with the given doc_type should match the expected string."""
    parsed_root = xmlio.parse(BytesIO(xml))
    self.assertEqual(xmlio.output(parsed_root, doc_type).decode("utf-8"), xml_expected)