Example #1
    def convert_xml(self, doi_id, xml_file, filenames, new_filenames):

        # Register namespaces
        xmlio.register_xmlns()

        root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)

        soup = self.article_soup(xml_file)

        if parser.is_poa(soup):
            # Capitalise subject group values in article categories
            root = self.subject_group_convert_in_xml(root)

            pub_date = parser.pub_date(soup)
            if pub_date is None:
                # add the published date to the XML
                pub_date = self.get_pub_date_if_missing(doi_id)
                root = self.add_pub_date_to_xml(doi_id, pub_date, root)

            if parser.volume(soup) is None:
                # Get the pub-date year to calculate the volume
                year = pub_date[0]
                volume = year - 2011
                self.add_volume_to_xml(doi_id, volume, root)

            # set the article-id, to overwrite the v2, v3 value if present
            root = self.set_article_id_xml(doi_id, root)

            # if a pdf file is present and there is no self-uri tag yet, add one
            self_uri_list = parser.self_uri(soup)
            if self_uri_list is not None and len(self_uri_list) == 0:
                for filename in new_filenames:
                    if filename.endswith('.pdf'):
                        root = self.add_self_uri_to_xml(doi_id, filename, root)

            # if a ds.zip file is present, add it to the XML
            poa_ds_zip_file = None
            for f in new_filenames:
                if f.endswith('.zip'):
                    poa_ds_zip_file = f
            if poa_ds_zip_file:
                root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file,
                                                  root)

        # Start the file output
        reparsed_string = xmlio.output(root,
                                       type=None,
                                       doctype_dict=doctype_dict)

        # Remove extra whitespace to clean up PoA articles (and one VoR file, too)
        reparsed_string = reparsed_string.replace("\n", '').replace("\t", '')

        with open(xml_file, 'wb') as open_file:
            open_file.write(reparsed_string)
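
The volume calculation above derives the eLife volume number directly from the publication year. A minimal standalone sketch, assuming parser.pub_date() returns a time.struct_time-style tuple (so index 0 is the year), using a hypothetical date:

    import time

    # hypothetical publication date; struct_time index 0 is the year (tm_year)
    pub_date = time.strptime("2015-06-09", "%Y-%m-%d")
    year = pub_date[0]       # 2015
    volume = year - 2011     # by this formula 2012 is volume 1, so 2015 gives volume 4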
    def items_to_match(self, soup):
        graphics = parser.graphics(soup)
        media = parser.media(soup)
        self_uri = parser.self_uri(soup)
        inline_graphics = parser.inline_graphics(soup)
        return graphics + media + self_uri + inline_graphics
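
items_to_match simply concatenates every asset-level tag list the parser returns (graphics, media, self-uri and inline graphics) into one list. A rough usage sketch with hypothetical tag data standing in for real elifetools parser output, which carries more keys:

    # hypothetical tag dicts standing in for parser output
    graphics = [{'xlink_href': 'elife-00123-fig1-v1.tif'}]
    media = [{'xlink_href': 'elife-00123-media1.mp4'}]
    self_uri = [{'xlink_href': 'elife-00123-v1.pdf'}]
    inline_graphics = []

    items = graphics + media + self_uri + inline_graphics
    hrefs = [item.get('xlink_href') for item in items]  # filenames to match against the package files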