def convert_xml(self, doi_id, xml_file, filenames, new_filenames):

    # Register namespaces
    xmlio.register_xmlns()

    root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)

    soup = self.article_soup(xml_file)

    if parser.is_poa(soup):
        # Capitalise subject group values in article categories
        root = self.subject_group_convert_in_xml(root)

        pub_date = None
        if parser.pub_date(soup) is None:
            # add the published date to the XML
            pub_date = self.get_pub_date_if_missing(doi_id)
            root = self.add_pub_date_to_xml(doi_id, pub_date, root)
        else:
            pub_date = parser.pub_date(soup)

        if parser.volume(soup) is None:
            # Get the pub-date year to calculate the volume
            year = pub_date[0]
            volume = year - 2011
            self.add_volume_to_xml(doi_id, volume, root)

        # set the article-id, to overwrite the v2, v3 value if present
        root = self.set_article_id_xml(doi_id, root)

        # if pdf file then add self-uri tag
        if parser.self_uri(soup) is not None and len(parser.self_uri(soup)) == 0:
            for filename in new_filenames:
                if filename.endswith('.pdf'):
                    root = self.add_self_uri_to_xml(doi_id, filename, root)

        # if ds.zip file is there, then add it to the xml
        poa_ds_zip_file = None
        for f in new_filenames:
            if f.endswith('.zip'):
                poa_ds_zip_file = f
        if poa_ds_zip_file:
            root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file, root)

    # Start the file output
    reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)

    # Remove extra whitespace here for PoA articles to clean up
    # and one VoR file too
    reparsed_string = reparsed_string.replace("\n", '').replace("\t", '')

    f = open(xml_file, 'wb')
    f.write(reparsed_string)
    f.close()
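# A minimal usage sketch for convert_xml (hypothetical values; the calling
# activity supplies the article DOI id, the path to the downloaded XML, and
# the original and renamed file lists):
#
#     self.convert_xml(
#         doi_id=3530,
#         xml_file='tmp_dir/elife_poa_e03530.xml',
#         filenames=['elife_poa_e03530.xml', 'elife_poa_e03530.pdf'],
#         new_filenames=['elife-03530.xml', 'elife-03530.pdf',
#                        'elife-03530-supp.zip'])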
def items_to_match(self, soup):
    """Gather the graphic, media, self-uri and inline-graphic items
    parsed from the article XML."""
    graphics = parser.graphics(soup)
    media = parser.media(soup)
    self_uri = parser.self_uri(soup)
    inline_graphics = parser.inline_graphics(soup)
    return graphics + media + self_uri + inline_graphics
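# Sketch of how items_to_match might be used (assumed caller, not part of
# this module; each item is expected to be a dict returned by the elifetools
# parser carrying the file reference to compare against the renamed files):
#
#     soup = self.article_soup(xml_file)
#     for item in self.items_to_match(soup):
#         ...  # e.g. check the item's file reference against new_filenames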