예제 #1
0
    def __init__(self, collection=False, title=''):
        """
        Sets up empty navigation state.

        Parameters
        ----------
        collection : bool, optional
            Stored unchanged as ``self.collection``.
        title : str, optional
            Stored unchanged as ``self.title``.
        """
        self.collection = collection

        #The limited forms of metadata that might make it in to the navigation
        #document. Both are used for EPUB2, only the title is used for EPUB3
        self.title = title
        self.contributors = OrderedSet()

        #No article has been seen yet
        self.article = self.article_doi = None
        self.all_dois = []  # Used to create UID

        #The nav structure is a list of navpoint trees, each navpoint may have
        #children navpoints. It is converted to the appropriate xml/xhtml
        #structure and written to file when required.
        self.nav = []
        self.nav_depth = 0

        #Special navigation structures: List of Equations/Figures/Tables
        self.equations_list = []
        self.figures_list = []
        self.tables_list = []

        #Internal counters, both start at zero
        self._play_order = self._auto_id = 0
예제 #2
0
    def __init__(self, collection=False, title=''):
        """
        Sets up empty navigation state.

        Parameters
        ----------
        collection : bool, optional
            Stored unchanged as ``self.collection``.
        title : str, optional
            Stored unchanged as ``self.title``.
        """
        self.collection = collection

        #The limited forms of metadata that might make it in to the navigation
        #document. Both are used for EPUB2, only the title is used for EPUB3
        self.title = title
        self.contributors = OrderedSet()

        #No article has been seen yet
        self.article = self.article_doi = None
        self.all_dois = []  # Used to create UID

        #The nav structure is a list of navpoint trees, each navpoint may have
        #children navpoints. It is converted to the appropriate xml/xhtml
        #structure and written to file when required.
        self.nav = []
        self.nav_depth = 0

        #Special navigation structures: List of Equations/Figures/Tables
        self.equations_list = []
        self.figures_list = []
        self.tables_list = []

        #Internal counters, both start at zero
        self._play_order = self._auto_id = 0
예제 #3
0
    def __init__(self, collection=False, title=''):
        """
        Sets up an empty package with its default metadata values.

        Parameters
        ----------
        collection : bool, optional
            Stored as ``self.collection``; when truthy, `title` becomes the
            package title.
        title : str, optional
            Title assigned when `collection` is truthy, otherwise unused here.
        """
        self.collection = collection

        #Collections receive assigned titles, anything else starts untitled
        self.title = title if self.collection else None  # 1  Title

        #No article has been seen yet
        self.article = self.article_doi = None
        self.spine_list = []
        self.all_dois = []  # Used to create unique id and rights in collections

        #Metadata elements with fixed values
        self.format = 'application/epub+zip'  # 1  Always epub
        self.type = 'text'                    # 1  Always text

        #Metadata elements that start out empty
        self.pub_id = None
        self.rights_associations = {}         # Keeps track per-article
        self.contributors = OrderedSet()      # 0+ Authors/Editors/Reviewers
        self.dates = OrderedSet()             # 0+ Publication date (probably)
        self.descriptions = OrderedSet()      # 0+ Long descriptions (abstracts)
        self.languages = OrderedSet()         # 1+ All languages present in doc
        self.publishers = OrderedSet()        # 0+ All publishers of content
        self.rights = OrderedSet()            # 1  License, details TBD
        self.subjects = OrderedSet()          # 0+ Subjects covered in doc
        self.coverage = OrderedSet()          # 0+ Not used yet
        self.relation = OrderedSet()          # 0+ Not used yet
        self.source = OrderedSet()            # 0+ Not used yet
예제 #4
0
class Package(object):
    """
    The Package class
    """

    def __init__(self, collection=False, title=''):
        """
        Sets up an empty package with its default metadata values.

        Parameters
        ----------
        collection : bool, optional
            Stored as ``self.collection``; when truthy, `title` becomes the
            package title.
        title : str, optional
            Title assigned when `collection` is truthy, otherwise unused here.
        """
        self.collection = collection

        #Collections receive assigned titles, anything else starts untitled
        self.title = title if self.collection else None  # 1  Title

        #No article has been seen yet
        self.article = self.article_doi = None
        self.spine_list = []
        self.all_dois = []  # Used to create unique id and rights in collections

        #Metadata elements with fixed values
        self.format = 'application/epub+zip'  # 1  Always epub
        self.type = 'text'                    # 1  Always text

        #Metadata elements that start out empty
        self.pub_id = None
        self.rights_associations = {}         # Keeps track per-article
        self.contributors = OrderedSet()      # 0+ Authors/Editors/Reviewers
        self.dates = OrderedSet()             # 0+ Publication date (probably)
        self.descriptions = OrderedSet()      # 0+ Long descriptions (abstracts)
        self.languages = OrderedSet()         # 1+ All languages present in doc
        self.publishers = OrderedSet()        # 0+ All publishers of content
        self.rights = OrderedSet()            # 1  License, details TBD
        self.subjects = OrderedSet()          # 0+ Subjects covered in doc
        self.coverage = OrderedSet()          # 0+ Not used yet
        self.relation = OrderedSet()          # 0+ Not used yet
        self.source = OrderedSet()            # 0+ Not used yet

    def process(self, article):
        """
        Takes in an article, records the spine entries it needs and gathers
        its metadata.

        Unless the Package was created in collection mode with
        ``Package(collection=True)``, only the first article is accepted. A
        spine entry is always added for the Main Content Document; one is added
        for the Bibliographic Content Document when the article has ref
        elements in its back ref-list, and one (non-linear) for the Tables
        Content Document when the article's publisher reports out-of-flow
        tables. Metadata is then collected through ``acquire_metadata``.

        Parameters
        ----------
        article : openaccess_epub.article.Article instance
            An article to be included in the EPUB, to be processed for metadata
            and appropriate content document references.

        Returns
        -------
        False or None
            False when an additional article is refused in single mode, None
            in every other case (including an article without a publisher).
        """
        already_loaded = self.article is not None
        if already_loaded and not self.collection:
            log.warning('Could not process additional article. Package only '
                        'handles one article unless collection mode is set.')
            return False

        if article.publisher is None:
            log.error('Package cannot be generated for an Article '
                      'without a publisher!')
            return

        self.article = article
        #Only the part of the DOI after the first slash is kept
        self.article_doi = article.doi.split('/')[1]
        self.all_dois.append(article.doi)

        dash_doi = self.article_doi.replace('.', '-')

        def idref_for(kind):
            #Spine idrefs all share the '<kind>-<dashed doi>-xhtml' shape
            return '{0}-{1}-xhtml'.format(kind, dash_doi)

        #Main content document, always present
        self.spine_list.append(spine_item(idref_for('main'), True))

        #Biblio content document, only when there are references
        if article.root.xpath('./back/ref-list/ref'):
            self.spine_list.append(spine_item(idref_for('biblio'), True))

        #Tables content document, kept out of the linear reading order
        if article.publisher.has_out_of_flow_tables():
            self.spine_list.append(spine_item(idref_for('tables'), False))

        self.acquire_metadata()

    def acquire_metadata(self):
        """
        Handles the acquisition of metadata for both collection mode and single
        mode, uses the metadata methods belonging to the article's publisher
        attribute.
        """
        #For space economy
        publisher = self.article.publisher

        if self.collection:  # collection mode metadata gathering
            pass
        else:  # single mode metadata gathering
            self.pub_id = publisher.package_identifier()
            self.title = publisher.package_title()
            for date in publisher.package_date():
                self.dates.add(date)

        #Common metadata gathering
        for lang in publisher.package_language():
            self.languages.add(lang)  # languages
        for contributor in publisher.package_contributors():  # contributors
            self.contributors.add(contributor)
        self.publishers.add(publisher.package_publisher())  # publisher names
        desc = publisher.package_description()
        if desc is not None:
            self.descriptions.add(desc)
        for subj in publisher.package_subject():
            self.subjects.add(subj)  # subjects
        #Rights
        art_rights = publisher.package_rights()
        self.rights.add(art_rights)
        if art_rights not in self.rights_associations:
            self.rights_associations[art_rights] = [self.article.doi]
        else:
            self.rights_associations[art_rights].append(self.article.doi)

    def file_manifest(self, location):
        """
        An iterator through the files in a location which yields item elements
        suitable for insertion into the package manifest.
        """
        #Maps file extensions to mimetypes
        mimetypes = {'.jpg': 'image/jpeg',
                     '.jpeg': 'image/jpeg',
                     '.xml': 'application/xhtml+xml',
                     '.png': 'image/png',
                     '.css': 'text/css',
                     '.ncx': 'application/x-dtbncx+xml',
                     '.gif': 'image/gif',
                     '.tif': 'image/tif',
                     '.pdf': 'application/pdf',
                     '.xhtml': 'application/xhtml+xml',
                     '.ttf': 'application/vnd.ms-opentype',
                     '.otf': 'application/vnd.ms-opentype'}

        current_dir = os.getcwd()
        os.chdir(location)
        for dirpath, _dirnames, filenames in os.walk('.'):
            dirpath = dirpath[2:]  # A means to avoid dirpath prefix of './'
            for fn in filenames:
                fn_ext = os.path.splitext(fn)[-1]
                item = etree.Element('item')
                #Here we set three attributes: href, media-type, and id
                if not dirpath:
                    item.attrib['href'] = fn
                else:
                    item.attrib['href'] = '/'.join([dirpath, fn])
                item.attrib['media-type'] = mimetypes[fn_ext]
                #Special handling for common image types
                if fn_ext in ['.jpg', '.png', '.tif', '.jpeg']:
                    #the following lines assume we are using the convention
                    #where the article doi is prefixed by 'images-'
                    item.attrib['id'] = '-'.join([dirpath[7:],
                                                  fn.replace('.', '-')])
                else:
                    item.attrib['id'] = fn.replace('.', '-')
                yield item
        os.chdir(current_dir)

    def make_element(self, tagname, doc, attrs={}, text=''):
        new_element = etree.Element(self.ns_rectify(tagname, doc))
        for kwd, val in attrs.items():
            if val is None:  # None values will not become attributes
                continue
            new_element.attrib[self.ns_rectify(kwd, doc)] = val
        new_element.text = text
        return new_element

    def ns_rectify(self, tagname, document):
        if ':' not in tagname:
            return tagname
        else:
            ns, tag = tagname.split(':')
            return '{' + document.getroot().nsmap[ns] + '}' + tag

    def _init_package_doc(self, version):
        root = etree.XML('''\
<?xml version="1.0"?>
<package
   xmlns="http://www.idpf.org/2007/opf"
   xmlns:dc="http://purl.org/dc/elements/1.1/"
   xmlns:opf="http://www.idpf.org/2007/opf"
   xmlns:dcterms="http://purl.org/dc/terms/"
   version="{0}"
   unique-identifier="pub-identifier">\
</package>'''.format(version))
        document = etree.ElementTree(root)
        return document

    def render_EPUB2(self, location):
        log.info('Rendering Package Document for EPUB2')
        document = self._init_package_doc(version='2.0')
        package = document.getroot()

        #Make the Metadata
        metadata = etree.SubElement(package, 'metadata')

        #Metadata: Identifier
        if not self.collection:  # Identifier for single article
            ident = self.make_element('dc:identifier',
                                      document,
                                      {'id': 'pub-identifier',
                                       'opf:scheme': self.pub_id.scheme},
                                      self.pub_id.value)
            metadata.append(ident)
        else:  # Identifier for collection
            ident = self.make_element('dc:identifier',
                                      document,
                                      {'id': 'pub-identifier',
                                       'opf:scheme': 'DOI'},
                                      ','.join(self.all_dois))
            metadata.append(ident)

        #Metadata: Title
        #Divergence between single articles and collections for titles is
        #handled during initiation and selective metadata acquisition, not here
        title = self.make_element('dc:title', document, text=self.title)
        metadata.append(title)

        #Metadata: Languages
        for lang in self.languages:
            lang_el = self.make_element('dc:language', document, text=lang)
            metadata.append(lang_el)
        #So here's the deal about creators/contributors:
        #The EPUB2 OPF spec indicates a distinction between primary authors
        #(contained in dc:creator) and secondary authors (contained in
        #dc:contributor, along with all the other options in
        # http://www.idpf.org/epub/20/spec/OPF_2.0.1_draft.htm#TOC2.2.6). As far
        #as I can think there is no real use case in academic articles for
        #<dc:contributor role="aut">... We'll just make all contributors with
        #the 'aut' role as <dc:creator>s
        for contrib in self.contributors:
            tag = 'dc:creator' if contrib.role == 'aut' else 'dc:contributor'
            metadata.append(self.make_element(tag,
                                              document,
                                              {'opf:role': contrib.role,
                                               'opf:file-as': contrib.file_as},
                                              contrib.name))

        #Metadata: Descriptions
        for description in self.descriptions:
            metadata.append(self.make_element('dc:description',
                                               document,
                                               text=description))

        #Metadata: Subjects
        for subject in self.subjects:
            metadata.append(self.make_element('dc:subject',
                                              document,
                                              text=subject))

        #Metadata: Format
        metadata.append(self.make_element('dc:format',
                                          document,
                                          text=self.format))

        #Metadata: Publishers
        for publisher in self.publishers:
            metadata.append(self.make_element('dc:publisher',
                                              document,
                                              text=publisher))

        #Metadata: Dates
        for date in self.dates:
            #I use str coercion just to be safe, in case someone returns ints
            date_text = str(date.year)
            if date.month:
                date_text = '-'.join([date_text, str(date.month)])
                if date.day:
                    date_text = '-'.join([date_text, str(date.day)])
            metadata.append(self.make_element('dc:date',
                                              document,
                                              {'opf:event': date.event},
                                              date_text))

        #Metadata: Rights
        if self.collection:
            if len(self.rights) == 1:  # Only one license string present
                rights_text = '''\
All articles in this collection published according to the following license:
'''
                rights_text = ''.join([rights_text, self.rights.pop()])
            else:  # More than one, we need to refer to rights_associations
                rights_text = '''\
Articles in this collection were published according to different licenses. Each
unique license will be listed below, preceded by every article DOI to which it
applies.'''
                for lic, doi_list in self.rights_associations.items():
                    doi_line = ','.join(doi_list)
                    rights_text = '\n'.join([rights_text, doi_line, lic])
                metadata.append(self.make_element('dc:rights',
                                                  document,
                                                  text=rights_text))

        else:
            metadata.append(self.make_element('dc:rights',
                                              document,
                                              text=self.rights.pop()))

        #Not Implemented Metadata: Source, Type, Coverage, Relation

        #Make the Manifest
        manifest = etree.SubElement(package, 'manifest')
        for item in self.file_manifest(os.path.join(location, 'EPUB')):
            if item.attrib['id'] == 'toc-ncx':
                item.attrib['id'] = 'ncx'  # Special id for toc.ncx
            manifest.append(item)

        #Make the Spine
        spine = etree.SubElement(package, 'spine')
        spine.attrib['toc'] = 'ncx'
        for item in self.spine_list:
            itemref = etree.SubElement(spine, 'itemref')
            itemref.attrib['idref'] = item.idref
            itemref.attrib['linear'] = 'yes' if item.linear else 'no'

        with open(os.path.join(location, 'EPUB', 'package.opf'), 'wb') as output:
            output.write(etree.tostring(document, encoding='utf-8', pretty_print=True))

    def render_EPUB3(self, location):
        log.info('Rendering Package Document for EPUB3')
        document = self._init_package_doc(version='3.0')
        package = document.getroot()

        #Make the Metadata
        metadata = etree.SubElement(package, 'metadata')

        #Metadata: Identifier
        today = datetime.date.today().strftime('%Y.%m.%d')
        if not self.collection:  # Identifier for single article
            ident = self.make_element('dc:identifier',
                                      document,
                                      {'id': 'pub-identifier'},
                                      '.'.join([self.pub_id.value, today]))
            metadata.append(ident)
        else:  # Identifier for collection
            ident = self.make_element('dc:identifier',
                                      document,
                                      {'id': 'pub-identifier'},
                                      ','.join(self.all_dois) + '.' + today)
            metadata.append(ident)
        #Metadata: Identifier Refinement
        meta = self.make_element('meta',
                                 document,
                                 {'refines': '#pub-identifier',
                                  'property': 'identifier-type',
                                  'scheme': 'onix:codelist5'})
        if self.collection:  # Collections are always DOIs currently
            meta.text = '06'
            metadata.append(meta)
        else:
            if self.pub_id.scheme is not None:
                if self.pub_id.scheme == 'DOI':
                    meta.text = '06'
                    metadata.append(meta)
                else:  # We could do an ONIXlist lookup map here
                    raise ValueError('Unhandled id scheme!')

        #Metadata: Title
        #Divergence between single articles and collections for titles is
        #handled during initiation and selective metadata acquisition, not here
        title = self.make_element('dc:title',
                                  document,
                                  {'id': 'pub-title'},
                                  text=self.title)
        metadata.append(title)

        #Metadata: Title Refinement
        meta = self.make_element('meta',
                                 document,
                                 {'refines': '#pub-title',
                                  'property': 'title-type'},
                                 'main')
        metadata.append(meta)

        #Metadata: Languages
        for lang in self.languages:
            lang_el = self.make_element('dc:language', document, text=lang)
            metadata.append(lang_el)

        #Metadata: Contributors/Creators
        #So here's the deal about creators/contributors:
        #The EPUB2 OPF spec indicates a distinction between primary authors
        #(contained in dc:creator) and secondary authors (contained in
        #dc:contributor, along with all the other options in
        # http://www.idpf.org/epub/20/spec/OPF_2.0.1_draft.htm#TOC2.2.6). As far
        #as I can think there is no real use case in academic articles for
        #<dc:contributor role="aut">... We'll just make all contributors with
        #the 'aut' role as <dc:creator>s
        contrib_count = 0
        for contrib in self.contributors:
            tag = 'dc:creator' if contrib.role == 'aut' else 'dc:contributor'
            contrib_id = 'contrib{0}'.format(contrib_count)
            metadata.append(self.make_element(tag,
                                              document,
                                              {'id': contrib_id},
                                              text=contrib.name))

            #Metadata: Contributors/Creators Refinement
            #MARC Relators: http://www.loc.gov/marc/relators/relaterm.html
            #MARC Relators: http://www.loc.gov/marc/relators/relacode.html
            role_meta = self.make_element('meta',
                                          document,
                                          {'refines': '#' + contrib_id,
                                           'property': 'role',
                                           'scheme': 'marc:relators'})
            if contrib.role is not None:
                role_meta.text = contrib.role
                metadata.append(role_meta)

            file_as_meta = self.make_element('meta',
                                             document,
                                             {'refines': '#' + contrib_id,
                                              'property': 'file-as'})

            if contrib.file_as is not None:
                file_as_meta.text = contrib.file_as
                metadata.append(file_as_meta)
            contrib_count += 1

        #Metadata: Descriptions
        for description in self.descriptions:
            metadata.append(self.make_element('dc:description',
                                               document,
                                               text=description))

        #Metadata: Subjects
        for subject in self.subjects:
            metadata.append(self.make_element('dc:subject',
                                              document,
                                              text=subject))

        #Metadata: Format
        metadata.append(self.make_element('dc:format',
                                          document,
                                          text=self.format))

        #Metadata: Publishers
        for publisher in self.publishers:
            metadata.append(self.make_element('dc:publisher',
                                              document,
                                              text=publisher))

        #Metadata: Dates
        #EPUB3 differs significantly from EPUB2, only one dc:date is allowed
        #and it must be the date of EPUB publication
        #Must also be of proper format: http://www.w3.org/TR/NOTE-datetime
        simple_date = datetime.date.today().strftime('%Y-%m-%d')
        metadata.append(self.make_element('dc:date',
                                          document,
                                          {'id': 'pub-date'},
                                          simple_date))
        #Must have meta with dcterms:modified
        now = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
        metadata.append(self.make_element('meta',
                                          document,
                                          {'property': 'dcterms:modified'},
                                          now))
        #Metadata: Dates Refinement
        #values are dateAccepted, dateCopyrighted, dateSubmitted
        accepted = self.make_element('meta',
                                     document,
                                     {'refines': '#pub-date',
                                      'property': 'dcterms:dateAccepted'})
        copyrighted = self.make_element('meta',
                                        document,
                                        {'refines': '#pub-date',
                                         'property': 'dcterms:dateCopyrighted'})
        submitted = self.make_element('meta',
                                      document,
                                      {'refines': '#pub-date',
                                       'property': 'dcterms:dateSubmitted'})

        def date_text(date):
            text = str(date.year)
            if date.month:
                text = '-'.join([text, str(date.month)])
                if date.day:
                    text = '-'.join([text, str(date.day)])
            return text

        for date in self.dates:
            if date.event == 'accepted':
                accepted.text = date_text(date)
                metadata.append(accepted)
            elif date.event == 'copyrighted':
                copyrighted.text = date_text(date)
                metadata.append(copyrighted)
            elif date.event == 'submitted':
                submitted.text = date_text(date)
                metadata.append(submitted)

        #Metadata: Rights
        if self.collection:
            if len(self.rights) == 1:  # Only one license string present
                rights_text = '''\
All articles in this collection published according to the following license:
'''
                rights_text = ''.join([rights_text, self.rights.pop()])
            else:  # More than one, we need to refer to rights_associations
                rights_text = '''\
Articles in this collection were published according to different licenses. Each
unique license will be listed below, preceded by every article DOI to which it
applies.'''
                for lic, doi_list in self.rights_associations.items():
                    doi_line = ','.join(doi_list)
                    rights_text = '\n'.join([rights_text, doi_line, lic])
            metadata.append(self.make_element('dc:rights',
                                              document,
                                              text=rights_text))

        else:
            metadata.append(self.make_element('dc:rights',
                                              document,
                                              text=self.rights.pop()))

        #Not Implemented Metadata: Source, Type, Coverage, Relation

        #Make the Manifest
        manifest = etree.SubElement(package, 'manifest')
        for item in self.file_manifest(os.path.join(location, 'EPUB')):
            if item.attrib['id'] == 'nav-xhtml':
                item.attrib['id'] = 'htmltoc'  # Special id for nav.xhtml
                item.attrib['properties'] = 'nav'
            if item.attrib['id'] == 'toc-ncx':
                item.attrib['id'] = 'ncx'  # Special id for toc.ncx
            manifest.append(item)

        #Make the Spine
        spine = etree.SubElement(package, 'spine')
        for item in self.spine_list:
            itemref = etree.SubElement(spine, 'itemref')
            itemref.attrib['idref'] = item.idref
            itemref.attrib['linear'] = 'yes' if item.linear else 'no'

        with open(os.path.join(location, 'EPUB', 'package.opf'), 'wb') as output:
            output.write(etree.tostring(document, encoding='utf-8', pretty_print=True))
예제 #5
0
    def __init__(self, collection=False, title=''):
        """
        Sets up an empty package with its default metadata values.

        Parameters
        ----------
        collection : bool, optional
            Stored as ``self.collection``; when truthy, `title` becomes the
            package title.
        title : str, optional
            Title assigned when `collection` is truthy, otherwise unused here.
        """
        self.collection = collection

        #Collections receive assigned titles, anything else starts untitled
        self.title = title if self.collection else None  # 1  Title

        #No article has been seen yet
        self.article = self.article_doi = None
        self.spine_list = []
        self.all_dois = []  # Used to create unique id and rights in collections

        #Metadata elements with fixed values
        self.format = 'application/epub+zip'  # 1  Always epub
        self.type = 'text'  # 1  Always text

        #Metadata elements that start out empty
        self.pub_id = None
        self.rights_associations = {}  # Keeps track per-article
        self.contributors = OrderedSet()  # 0+ Authors/Editors/Reviewers
        self.dates = OrderedSet()  # 0+ Publication date (probably)
        self.descriptions = OrderedSet()  # 0+ Long descriptions (abstracts)
        self.languages = OrderedSet()  # 1+ All languages present in doc
        self.publishers = OrderedSet()  # 0+ All publishers of content
        self.rights = OrderedSet()  # 1  License, details TBD
        self.subjects = OrderedSet()  # 0+ Subjects covered in doc
        self.coverage = OrderedSet()  # 0+ Not used yet
        self.relation = OrderedSet()  # 0+ Not used yet
        self.source = OrderedSet()  # 0+ Not used yet
예제 #6
0
class Package(object):
    """
    The Package class
    """
    def __init__(self, collection=False, title=''):
        """
        Initialise the containers a Package fills while processing articles.

        Parameters
        ----------
        collection : bool
            Kept as ``self.collection``; ``process`` and the render methods
            branch on it.
        title : str
            Only used when ``collection`` is true, in which case it becomes
            ``self.title``. In single mode ``self.title`` starts as None.
        """
        self.collection = collection
        self.spine_list = []

        self.article = None
        self.article_doi = None

        #Used to create unique id and rights in collections
        self.all_dois = []

        #Metadata elements (number = expected count in the package document)
        self.pub_id = None
        self.contributors = OrderedSet()  # 0+ Authors/Editors/Reviewers
        self.coverage = OrderedSet()  # 0+ Not used yet
        self.dates = OrderedSet()  # 0+ Publication date (probably)
        self.descriptions = OrderedSet()  # 0+ Long descriptions (abstracts)
        self.format = 'application/epub+zip'  # 1  Always epub
        self.languages = OrderedSet()  # 1+ All languages present in doc
        self.publishers = OrderedSet()  # 0+ All publishers of content
        self.relation = OrderedSet()  # 0+ Not used yet
        self.rights = OrderedSet()  # 1  License, details TBD
        self.rights_associations = {}  # Keeps track per-article
        self.source = OrderedSet()  # 0+ Not used yet
        self.subjects = OrderedSet()  # 0+ Subjects covered in doc
        self.title = None  # 1  Title of publication
        self.type = 'text'  # 1  Always text

        #Collections receive assigned titles
        if not self.collection:
            return
        self.title = title

    def process(self, article):
        """
        Register an article: record its DOI, queue its content documents in
        the spine list, and gather its metadata.

        Unless the Package was created with ``Package(collection=True)``, only
        the first article is accepted; further calls log a warning and return
        False. An article whose ``publisher`` attribute is None is rejected
        with a logged error (the method then returns None).

        Spine entries are appended in this order:

        * the Main Content Document (always, linear),
        * the Bibliographic Content Document (linear, only when the XPath
          ``./back/ref-list/ref`` matches on the article root),
        * the Tables Content Document (non-linear, only when the publisher's
          ``has_out_of_flow_tables()`` is true).

        Finally ``acquire_metadata`` is called.

        Parameters
        ----------
        article : openaccess_epub.article.Article instance
            An article to be included in the EPUB, to be processed for metadata
            and appropriate content document references.
        """
        already_loaded = self.article is not None
        if already_loaded and not self.collection:
            log.warning('Could not process additional article. Package only '
                        'handles one article unless collection mode is set.')
            return False

        if article.publisher is None:
            log.error('Package cannot be generated for an Article '
                      'without a publisher!')
            return

        self.article = article
        #Only the part after the DOI prefix is kept as the short DOI
        self.article_doi = self.article.doi.split('/')[1]
        self.all_dois.append(self.article.doi)

        #idrefs are built from the short DOI with dots replaced by dashes
        doi_slug = self.article_doi.replace('.', '-')
        main_ref = 'main-{0}-xhtml'.format(doi_slug)
        biblio_ref = 'biblio-{0}-xhtml'.format(doi_slug)
        tables_ref = 'tables-{0}-xhtml'.format(doi_slug)

        self.spine_list.append(spine_item(main_ref, True))

        has_references = self.article.root.xpath('./back/ref-list/ref')
        if has_references:
            self.spine_list.append(spine_item(biblio_ref, True))

        if self.article.publisher.has_out_of_flow_tables():
            self.spine_list.append(spine_item(tables_ref, False))

        self.acquire_metadata()

    def acquire_metadata(self):
        """
        Handles the acquisition of metadata for both collection mode and single
        mode, uses the metadata methods belonging to the article's publisher
        attribute.
        """
        #For space economy
        publisher = self.article.publisher

        if self.collection:  # collection mode metadata gathering
            pass
        else:  # single mode metadata gathering
            self.pub_id = publisher.package_identifier()
            self.title = publisher.package_title()
            for date in publisher.package_date():
                self.dates.add(date)

        #Common metadata gathering
        for lang in publisher.package_language():
            self.languages.add(lang)  # languages
        for contributor in publisher.package_contributors():  # contributors
            self.contributors.add(contributor)
        self.publishers.add(publisher.package_publisher())  # publisher names
        desc = publisher.package_description()
        if desc is not None:
            self.descriptions.add(desc)
        for subj in publisher.package_subject():
            self.subjects.add(subj)  # subjects
        #Rights
        art_rights = publisher.package_rights()
        self.rights.add(art_rights)
        if art_rights not in self.rights_associations:
            self.rights_associations[art_rights] = [self.article.doi]
        else:
            self.rights_associations[art_rights].append(self.article.doi)

    def file_manifest(self, location):
        """
        An iterator through the files in a location which yields item elements
        suitable for insertion into the package manifest.
        """
        #Maps file extensions to mimetypes
        mimetypes = {
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.xml': 'application/xhtml+xml',
            '.png': 'image/png',
            '.css': 'text/css',
            '.ncx': 'application/x-dtbncx+xml',
            '.gif': 'image/gif',
            '.tif': 'image/tif',
            '.pdf': 'application/pdf',
            '.xhtml': 'application/xhtml+xml',
            '.ttf': 'application/vnd.ms-opentype',
            '.otf': 'application/vnd.ms-opentype'
        }

        current_dir = os.getcwd()
        os.chdir(location)
        for dirpath, _dirnames, filenames in os.walk('.'):
            dirpath = dirpath[2:]  # A means to avoid dirpath prefix of './'
            for fn in filenames:
                fn_ext = os.path.splitext(fn)[-1]
                item = etree.Element('item')
                #Here we set three attributes: href, media-type, and id
                if not dirpath:
                    item.attrib['href'] = fn
                else:
                    item.attrib['href'] = '/'.join([dirpath, fn])
                item.attrib['media-type'] = mimetypes[fn_ext]
                #Special handling for common image types
                if fn_ext in ['.jpg', '.png', '.tif', '.jpeg']:
                    #the following lines assume we are using the convention
                    #where the article doi is prefixed by 'images-'
                    item.attrib['id'] = '-'.join(
                        [dirpath[7:], fn.replace('.', '-')])
                else:
                    item.attrib['id'] = fn.replace('.', '-')
                yield item
        os.chdir(current_dir)

    def make_element(self, tagname, doc, attrs={}, text=''):
        new_element = etree.Element(self.ns_rectify(tagname, doc))
        for kwd, val in list(attrs.items()):
            if val is None:  # None values will not become attributes
                continue
            new_element.attrib[self.ns_rectify(kwd, doc)] = val
        new_element.text = text
        return new_element

    def ns_rectify(self, tagname, document):
        if ':' not in tagname:
            return tagname
        else:
            ns, tag = tagname.split(':')
            return '{' + document.getroot().nsmap[ns] + '}' + tag

    def _init_package_doc(self, version):
        root = etree.XML('''\
<?xml version="1.0"?>
<package
   xmlns="http://www.idpf.org/2007/opf"
   xmlns:dc="http://purl.org/dc/elements/1.1/"
   xmlns:opf="http://www.idpf.org/2007/opf"
   xmlns:dcterms="http://purl.org/dc/terms/"
   version="{0}"
   unique-identifier="pub-identifier">\
</package>'''.format(version))
        document = etree.ElementTree(root)
        return document

    def render_EPUB2(self, location):
        log.info('Rendering Package Document for EPUB2')
        document = self._init_package_doc(version='2.0')
        package = document.getroot()

        #Make the Metadata
        metadata = etree.SubElement(package, 'metadata')

        #Metadata: Identifier
        if not self.collection:  # Identifier for single article
            ident = self.make_element('dc:identifier', document, {
                'id': 'pub-identifier',
                'opf:scheme': self.pub_id.scheme
            }, self.pub_id.value)
            metadata.append(ident)
        else:  # Identifier for collection
            ident = self.make_element('dc:identifier', document, {
                'id': 'pub-identifier',
                'opf:scheme': 'DOI'
            }, ','.join(self.all_dois))
            metadata.append(ident)

        #Metadata: Title
        #Divergence between single articles and collections for titles is
        #handled during initiation and selective metadata acquisition, not here
        title = self.make_element('dc:title', document, text=self.title)
        metadata.append(title)

        #Metadata: Languages
        for lang in self.languages:
            lang_el = self.make_element('dc:language', document, text=lang)
            metadata.append(lang_el)
        #So here's the deal about creators/contributors:
        #The EPUB2 OPF spec indicates a distinction between primary authors
        #(contained in dc:creator) and secondary authors (contained in
        #dc:contributor, along with all the other options in
        # http://www.idpf.org/epub/20/spec/OPF_2.0.1_draft.htm#TOC2.2.6). As far
        #as I can think there is no real use case in academic articles for
        #<dc:contributor role="aut">... We'll just make all contributors with
        #the 'aut' role as <dc:creator>s
        for contrib in self.contributors:
            tag = 'dc:creator' if contrib.role == 'aut' else 'dc:contributor'
            metadata.append(
                self.make_element(tag, document, {
                    'opf:role': contrib.role,
                    'opf:file-as': contrib.file_as
                }, contrib.name))

        #Metadata: Descriptions
        for description in self.descriptions:
            metadata.append(
                self.make_element('dc:description', document,
                                  text=description))

        #Metadata: Subjects
        for subject in self.subjects:
            metadata.append(
                self.make_element('dc:subject', document, text=subject))

        #Metadata: Format
        metadata.append(
            self.make_element('dc:format', document, text=self.format))

        #Metadata: Publishers
        for publisher in self.publishers:
            metadata.append(
                self.make_element('dc:publisher', document, text=publisher))

        #Metadata: Dates
        for date in self.dates:
            #I use str coercion just to be safe, in case someone returns ints
            date_text = str(date.year)
            if date.month:
                date_text = '-'.join([date_text, str(date.month)])
                if date.day:
                    date_text = '-'.join([date_text, str(date.day)])
            metadata.append(
                self.make_element('dc:date', document,
                                  {'opf:event': date.event}, date_text))

        #Metadata: Rights
        if self.collection:
            if len(self.rights) == 1:  # Only one license string present
                rights_text = '''\
All articles in this collection published according to the following license:
'''
                rights_text = ''.join([rights_text, self.rights.pop()])
            else:  # More than one, we need to refer to rights_associations
                rights_text = '''\
Articles in this collection were published according to different licenses. Each
unique license will be listed below, preceded by every article DOI to which it
applies.'''
                for lic, doi_list in list(self.rights_associations.items()):
                    doi_line = ','.join(doi_list)
                    rights_text = '\n'.join([rights_text, doi_line, lic])
                metadata.append(
                    self.make_element('dc:rights', document, text=rights_text))

        else:
            metadata.append(
                self.make_element('dc:rights',
                                  document,
                                  text=self.rights.pop()))

        #Not Implemented Metadata: Source, Type, Coverage, Relation

        #Make the Manifest
        manifest = etree.SubElement(package, 'manifest')
        for item in self.file_manifest(os.path.join(location, 'EPUB')):
            if item.attrib['id'] == 'toc-ncx':
                item.attrib['id'] = 'ncx'  # Special id for toc.ncx
            manifest.append(item)

        #Make the Spine
        spine = etree.SubElement(package, 'spine')
        spine.attrib['toc'] = 'ncx'
        for item in self.spine_list:
            itemref = etree.SubElement(spine, 'itemref')
            itemref.attrib['idref'] = item.idref
            itemref.attrib['linear'] = 'yes' if item.linear else 'no'

        with open(os.path.join(location, 'EPUB', 'package.opf'),
                  'wb') as output:
            output.write(
                etree.tostring(document, encoding='utf-8', pretty_print=True))

    def render_EPUB3(self, location):
        log.info('Rendering Package Document for EPUB3')
        document = self._init_package_doc(version='3.0')
        package = document.getroot()

        #Make the Metadata
        metadata = etree.SubElement(package, 'metadata')

        #Metadata: Identifier
        today = datetime.date.today().strftime('%Y.%m.%d')
        if not self.collection:  # Identifier for single article
            ident = self.make_element('dc:identifier', document,
                                      {'id': 'pub-identifier'},
                                      '.'.join([self.pub_id.value, today]))
            metadata.append(ident)
        else:  # Identifier for collection
            ident = self.make_element('dc:identifier', document,
                                      {'id': 'pub-identifier'},
                                      ','.join(self.all_dois) + '.' + today)
            metadata.append(ident)
        #Metadata: Identifier Refinement
        meta = self.make_element(
            'meta', document, {
                'refines': '#pub-identifier',
                'property': 'identifier-type',
                'scheme': 'onix:codelist5'
            })
        if self.collection:  # Collections are always DOIs currently
            meta.text = '06'
            metadata.append(meta)
        else:
            if self.pub_id.scheme is not None:
                if self.pub_id.scheme == 'DOI':
                    meta.text = '06'
                    metadata.append(meta)
                else:  # We could do an ONIXlist lookup map here
                    raise ValueError('Unhandled id scheme!')

        #Metadata: Title
        #Divergence between single articles and collections for titles is
        #handled during initiation and selective metadata acquisition, not here
        title = self.make_element('dc:title',
                                  document, {'id': 'pub-title'},
                                  text=self.title)
        metadata.append(title)

        #Metadata: Title Refinement
        meta = self.make_element('meta', document, {
            'refines': '#pub-title',
            'property': 'title-type'
        }, 'main')
        metadata.append(meta)

        #Metadata: Languages
        for lang in self.languages:
            lang_el = self.make_element('dc:language', document, text=lang)
            metadata.append(lang_el)

        #Metadata: Contributors/Creators
        #So here's the deal about creators/contributors:
        #The EPUB2 OPF spec indicates a distinction between primary authors
        #(contained in dc:creator) and secondary authors (contained in
        #dc:contributor, along with all the other options in
        # http://www.idpf.org/epub/20/spec/OPF_2.0.1_draft.htm#TOC2.2.6). As far
        #as I can think there is no real use case in academic articles for
        #<dc:contributor role="aut">... We'll just make all contributors with
        #the 'aut' role as <dc:creator>s
        contrib_count = 0
        for contrib in self.contributors:
            tag = 'dc:creator' if contrib.role == 'aut' else 'dc:contributor'
            contrib_id = 'contrib{0}'.format(contrib_count)
            metadata.append(
                self.make_element(tag,
                                  document, {'id': contrib_id},
                                  text=contrib.name))

            #Metadata: Contributors/Creators Refinement
            #MARC Relators: http://www.loc.gov/marc/relators/relaterm.html
            #MARC Relators: http://www.loc.gov/marc/relators/relacode.html
            role_meta = self.make_element(
                'meta', document, {
                    'refines': '#' + contrib_id,
                    'property': 'role',
                    'scheme': 'marc:relators'
                })
            if contrib.role is not None:
                role_meta.text = contrib.role
                metadata.append(role_meta)

            file_as_meta = self.make_element('meta', document, {
                'refines': '#' + contrib_id,
                'property': 'file-as'
            })

            if contrib.file_as is not None:
                file_as_meta.text = contrib.file_as
                metadata.append(file_as_meta)
            contrib_count += 1

        #Metadata: Descriptions
        for description in self.descriptions:
            metadata.append(
                self.make_element('dc:description', document,
                                  text=description))

        #Metadata: Subjects
        for subject in self.subjects:
            metadata.append(
                self.make_element('dc:subject', document, text=subject))

        #Metadata: Format
        metadata.append(
            self.make_element('dc:format', document, text=self.format))

        #Metadata: Publishers
        for publisher in self.publishers:
            metadata.append(
                self.make_element('dc:publisher', document, text=publisher))

        #Metadata: Dates
        #EPUB3 differs significantly from EPUB2, only one dc:date is allowed
        #and it must be the date of EPUB publication
        #Must also be of proper format: http://www.w3.org/TR/NOTE-datetime
        simple_date = datetime.date.today().strftime('%Y-%m-%d')
        metadata.append(
            self.make_element('dc:date', document, {'id': 'pub-date'},
                              simple_date))
        #Must have meta with dcterms:modified
        now = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
        metadata.append(
            self.make_element('meta', document,
                              {'property': 'dcterms:modified'}, now))
        #Metadata: Dates Refinement
        #values are dateAccepted, dateCopyrighted, dateSubmitted
        accepted = self.make_element('meta', document, {
            'refines': '#pub-date',
            'property': 'dcterms:dateAccepted'
        })
        copyrighted = self.make_element('meta', document, {
            'refines': '#pub-date',
            'property': 'dcterms:dateCopyrighted'
        })
        submitted = self.make_element('meta', document, {
            'refines': '#pub-date',
            'property': 'dcterms:dateSubmitted'
        })

        def date_text(date):
            text = str(date.year)
            if date.month:
                text = '-'.join([text, str(date.month)])
                if date.day:
                    text = '-'.join([text, str(date.day)])
            return text

        for date in self.dates:
            if date.event == 'accepted':
                accepted.text = date_text(date)
                metadata.append(accepted)
            elif date.event == 'copyrighted':
                copyrighted.text = date_text(date)
                metadata.append(copyrighted)
            elif date.event == 'submitted':
                submitted.text = date_text(date)
                metadata.append(submitted)

        #Metadata: Rights
        if self.collection:
            if len(self.rights) == 1:  # Only one license string present
                rights_text = '''\
All articles in this collection published according to the following license:
'''
                rights_text = ''.join([rights_text, self.rights.pop()])
            else:  # More than one, we need to refer to rights_associations
                rights_text = '''\
Articles in this collection were published according to different licenses. Each
unique license will be listed below, preceded by every article DOI to which it
applies.'''
                for lic, doi_list in list(self.rights_associations.items()):
                    doi_line = ','.join(doi_list)
                    rights_text = '\n'.join([rights_text, doi_line, lic])
            metadata.append(
                self.make_element('dc:rights', document, text=rights_text))

        else:
            metadata.append(
                self.make_element('dc:rights',
                                  document,
                                  text=self.rights.pop()))

        #Not Implemented Metadata: Source, Type, Coverage, Relation

        #Make the Manifest
        manifest = etree.SubElement(package, 'manifest')
        for item in self.file_manifest(os.path.join(location, 'EPUB')):
            if item.attrib['id'] == 'nav-xhtml':
                item.attrib['id'] = 'htmltoc'  # Special id for nav.xhtml
                item.attrib['properties'] = 'nav'
            if item.attrib['id'] == 'toc-ncx':
                item.attrib['id'] = 'ncx'  # Special id for toc.ncx
            manifest.append(item)

        #Make the Spine
        spine = etree.SubElement(package, 'spine')
        for item in self.spine_list:
            itemref = etree.SubElement(spine, 'itemref')
            itemref.attrib['idref'] = item.idref
            itemref.attrib['linear'] = 'yes' if item.linear else 'no'

        with open(os.path.join(location, 'EPUB', 'package.opf'),
                  'wb') as output:
            output.write(
                etree.tostring(document, encoding='utf-8', pretty_print=True))
예제 #7
0
class Navigation(object):

    def __init__(self, collection=False, title=''):
        """
        Create an empty Navigation with no article processed yet.

        Parameters
        ----------
        collection : bool
            Stored as ``self.collection``; ``process`` and the mapping methods
            branch on it.
        title : str
            Initial value of ``self.title``.
        """
        self.collection = collection

        #No article has been ingested yet
        self.article = None
        self.article_doi = None
        self.all_dois = []  # Used to create UID

        #Special navigation structures: List of Equations/Figures/Tables
        self.equations_list, self.figures_list, self.tables_list = [], [], []

        #These are the limited forms of metadata that might make it in to the
        #navigation document. Both are used for EPUB2, only the title is used
        #for EPUB3
        self.title = title
        self.contributors = OrderedSet()

        #The nav structure is a list of navpoint trees. Each navpoint may have
        #children navpoints. This structure will be converted to the appropriate
        #xml/xhtml structure and written to file when required.
        self.nav = []
        self.nav_depth = 0

        #Counters behind the play_order and auto_id properties
        self._play_order = 0
        self._auto_id = 0

    def process(self, article):
        """
        Ingests an Article to create navigation structures and parse global
        metadata.

        Outside collection mode only the first article is accepted; further
        calls log a warning and return False. An article whose ``publisher``
        attribute is None is rejected with a logged error (returning None).
        Otherwise the DOI is recorded, the title is taken from the publisher
        (single mode only), the publisher's navigation contributors are
        added, and ``map_navigation`` is run.
        """
        single_mode = not self.collection
        if single_mode and self.article is not None:
            log.warning('Could not process additional article. Navigation only '
                        'handles one article unless collection mode is set.')
            return False

        if article.publisher is None:
            log.error('Navigation cannot be generated for an Article '
                      'without a publisher!')
            return

        self.article = article
        #Only the part after the DOI prefix is kept as the short DOI
        self.article_doi = self.article.doi.split('/')[1]
        self.all_dois.append(self.article.doi)

        #Collections keep the title they were constructed with
        if not self.collection:
            self.title = self.article.publisher.nav_title()
        for author in self.article.publisher.nav_contributors():
            self.contributors.add(author)

        #Analyze the structure of the article to create internal mapping
        self.map_navigation()

    def map_navigation(self):
        """
        This is a wrapper for depth-first recursive analysis of the article.

        A navpoint for the article title is always appended to ``self.nav``.
        The navpoints found in the article body, and a final "References"
        navpoint when the article has references, are then attached either
        below the title navpoint (collection mode) or next to it at the top
        level (single mode).
        """
        #All articles should have titles
        title_id = 'titlepage-{0}'.format(self.article_doi)
        title_label = self.article.publisher.nav_title()
        title_source = 'main.{0}.xhtml#title'.format(self.article_doi)
        title_navpoint = navpoint(title_id, title_label, self.play_order,
                                  title_source, [])
        self.nav.append(title_navpoint)
        #When processing a collection of articles, we will want all subsequent
        #navpoints for this article to be located under the title
        if self.collection:
            nav_insertion = title_navpoint.children
        else:
            nav_insertion = self.nav

        #If the article has a body, we'll need to parse it for navigation
        if self.article.body is not None:
            #Here is where we invoke the recursive parsing!
            nav_insertion.extend(
                self.recursive_article_navmap(self.article.body))

        #Add a navpoint to the references if appropriate
        #ref elements live inside back/ref-list; this is the same test
        #Package.process applies before putting the biblio document in the
        #spine, so the navpoint exists exactly when that document is listed
        if self.article.root.xpath('./back/ref-list/ref'):
            ref_id = 'references-{0}'.format(self.article_doi)
            ref_label = 'References'
            ref_source = 'biblio.{0}.xhtml#references'.format(self.article_doi)
            ref_navpoint = navpoint(ref_id, ref_label, self.play_order,
                                    ref_source, [])
            nav_insertion.append(ref_navpoint)

    def recursive_article_navmap(self, src_element, depth=0, first=True):
        """
        This function recursively traverses the content of an input article to
        add the correct elements to the NCX file's navMap and Lists.
        """
        if depth > self.nav_depth:
            self.nav_depth = depth
        navpoints = []
        tagnames = ['sec', 'fig', 'table-wrap']
        for child in src_element:
            try:
                tagname = child.tag
            except AttributeError:
                continue
            else:
                if tagname not in tagnames:
                    continue

            #Safely handle missing id attributes
            if 'id' not in child.attrib:
                child.attrib['id'] = self.auto_id

            #If in collection mode, we'll prepend the article DOI to avoid
            #collisions
            if self.collection:
                child_id = '-'.join([self.article_doi,
                                     child.attrib['id']])
            else:
                child_id = child.attrib['id']

            #Attempt to infer the correct text as a label
            #Skip the element if we cannot
            child_title = child.find('title')
            if child_title is None:
                continue  # If there is no immediate title, skip this element
            label = element_methods.all_text(child_title)
            if not label:
                continue  # If no text in the title, skip this element
            source = 'main.{0}.xhtml#{1}'.format(self.article_doi,
                                               child.attrib['id'])
            if tagname == 'sec':
                children = self.recursive_article_navmap(child, depth=depth + 1)
                navpoints.append(navpoint(child_id,
                                          label,
                                          self.play_order,
                                          source,
                                          children))
            #figs and table-wraps do not have children
            elif tagname == 'fig':  # Add navpoints to list_of_figures
                self.figures_list.append(navpoint(child.attrib['id'],
                                                  label,
                                                  None,
                                                  source,
                                                  []))
            elif tagname == 'table-wrap':  # Add navpoints to list_of_tables
                self.tables_list.append(navpoint(child.attrib['id'],
                                                 label,
                                                 None,
                                                 source,
                                                 []))
        return navpoints

    def render_EPUB2(self, location):
        """
        Creates the NCX specified file for EPUB2
        """

        def make_navlabel(text):
            """
            Creates and returns a navLabel element with the supplied text.
            """
            navlabel = etree.Element('navLabel')
            navlabel_text = etree.SubElement(navlabel, 'text')
            navlabel_text.text = text
            return navlabel

        def make_navMap(nav=None):
            if nav is None:
                nav_element = etree.Element('navMap')
                for nav_point in self.nav:
                    nav_element.append(make_navMap(nav=nav_point))
            else:
                nav_element = etree.Element('navPoint')
                nav_element.attrib['id'] = nav.id
                nav_element.attrib['playOrder'] = nav.playOrder
                nav_element.append(make_navlabel(nav.label))
                content_element = etree.SubElement(nav_element, 'content')
                content_element.attrib['src'] = nav.source
                for child in nav.children:
                    nav_element.append(make_navMap(nav=child))
            return nav_element
        root = etree.XML('''\
<?xml version="1.0"?>\
<ncx version="2005-1" xmlns="http://www.daisy.org/z3986/2005/ncx/">\
<head>\
<meta name="dtb:uid" content="{uid}"/>\
<meta name="dtb:depth" content="{depth}"/>\
<meta name="dtb:totalPageCount" content="0"/>\
<meta name="dtb:maxPageNumber" content="0"/>\
<meta name="dtb:generator" content="OpenAccess_EPUB {version}"/>\
</head>\
</ncx>'''.format(**{'uid': ','.join(self.all_dois),
                    'depth': self.nav_depth,
                    'version': __version__}))
        document = etree.ElementTree(root)
        ncx = document.getroot()

        #Create the docTitle element
        doctitle = etree.SubElement(ncx, 'docTitle')
        doctitle_text = etree.SubElement(doctitle, 'text')
        doctitle_text.text = self.title

        #Create the docAuthor elements
        for contributor in self.contributors:
            if contributor.role == 'author':
                docauthor = etree.SubElement(ncx, 'docAuthor')
                docauthor_text = etree.SubElement(docauthor, 'text')
                docauthor_text.text = contributor.name

        #Create the navMap element
        ncx.append(make_navMap())

        if self.figures_list:
            navlist = etree.SubElement(ncx, 'navList')
            navlist.append(make_navlabel('List of Figures'))
            for nav_pt in self.figures_list:
                navtarget = etree.SubElement(navlist, 'navTarget')
                navtarget.attrib['id'] = nav_pt.id
                navtarget.append(self.make_navlabel(nav_pt.label))
                content = etree.SubElement(navtarget, 'content')
                content.attrib['src'] = nav_pt.source

        if self.tables_list:
            navlist = etree.SubElement(ncx, 'navList')
            navlist.append(make_navlabel('List of Tables'))
            for nav_pt in self.tables_list:
                navtarget = etree.SubElement(navlist, 'navTarget')
                navtarget.attrib['id'] = nav_pt.id
                navtarget.append(self.make_navlabel(nav_pt.label))
                content = etree.SubElement(navtarget, 'content')
                content.attrib['src'] = nav_pt.source

        with open(os.path.join(location, 'EPUB', 'toc.ncx'), 'wb') as output:
            output.write(etree.tostring(document, encoding='utf-8', pretty_print=True))

    def render_EPUB3(self, location):
        def make_nav(nav=None):
            if nav is None:
                nav_element = etree.Element('ol')
                for nav_point in self.nav:
                    nav_element.append(make_nav(nav=nav_point))
            else:
                nav_element = etree.Element('li')
                a = etree.SubElement(nav_element, 'a')
                a.attrib['href'] = nav.source
                a.text = nav.label
                if nav.children:
                    ol = etree.SubElement(nav_element, 'ol')
                    for child in nav.children:
                        ol.append(make_nav(nav=child))
            return nav_element

        root = etree.XML('''\
<?xml version="1.0"?>\
<!DOCTYPE html>\
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">\
<head>\
<link rel="stylesheet" type="text/css" href="css/default.css" />\
</head>\
</html>''')

        document = etree.ElementTree(root)
        html = document.getroot()
        title = etree.SubElement(html[0], 'title')
        title.text = self.title

        body = etree.SubElement(html, 'body')  # Create the body element
        #h1 = etree.SubElement(body, 'h1')
        #h1.text = self.title
        #Create the prinary nav element
        nav = etree.SubElement(body, 'nav')
        nav.attrib['{http://www.idpf.org/2007/ops}type'] = 'toc'
        nav.attrib['id'] = 'toc'

        #Create the title
        h2 = etree.SubElement(nav, 'h2')
        h2.text = 'Table of Contents'

        #Stuff
        nav.append(make_nav())

        if self.figures_list:
            nav = etree.SubElement(body, 'nav')
            h2 = etree.SubElement(nav, 'h2')
            h2.text = 'List of Figures'
            ol = etree.SubElement(nav, 'ol')
            for nav_pt in self.figures_list:
                li = etree.SubElement(ol, 'li')
                a = etree.SubElement(li, 'a')
                a.attrib['href'] = nav_pt.source
                a.text = nav_pt.label

        if self.tables_list:
            nav = etree.SubElement(body, 'nav')
            h2 = etree.SubElement(nav, 'h2')
            h2.text = 'List of Tables'
            ol = etree.SubElement(nav, 'ol')
            for nav_pt in self.figures_list:
                li = etree.SubElement(ol, 'li')
                a = etree.SubElement(li, 'a')
                a.attrib['href'] = nav_pt.source
                a.text = nav_pt.label

        with open(os.path.join(location, 'EPUB', 'nav.xhtml'), 'wb') as output:
            output.write(etree.tostring(document, encoding='utf-8', pretty_print=True))

    @property
    def play_order(self):
        self._play_order += 1
        return str(self._play_order)

    @property
    def auto_id(self):
        """
        Advance the automatic id counter and return a fresh ``OAE-<n>`` id.

        Every access counts and is reported through a debug log message.
        """
        self._auto_id = self._auto_id + 1
        generated = 'OAE-{0}'.format(self._auto_id)
        message = 'Navigation element missing ID: assigned {0}'
        log.debug(message.format(generated))
        return generated
예제 #8
0
class Navigation(object):
    """
    Collects navigation information from ingested articles and renders it
    as an EPUB2 NCX file (toc.ncx) or an EPUB3 navigation document
    (nav.xhtml).

    In collection mode several articles may be processed; each article's
    navpoints are then nested beneath that article's title navpoint.
    """

    def __init__(self, collection=False, title=''):
        self.collection = collection

        #Special navigation structures: List of Equations/Figures/Tables
        self.equations_list = []
        self.figures_list = []
        self.tables_list = []

        self.article = None
        self.article_doi = None
        self.all_dois = []  # Used to create UID

        #These are the limited forms of metadata that might make it in to the
        #navigation document. Both are used for EPUB2, only the title is used
        #for EPUB3
        self.title = title
        self.contributors = OrderedSet()

        #The nav structure is a list of navpoint trees. Each navpoint may have
        #children navpoints. This structure will be converted to the appropriate
        #xml/xhtml structure and written to file when required.
        self.nav = []
        self.nav_depth = 0

        #Counters backing the play_order and auto_id properties
        self._play_order = 0
        self._auto_id = 0

    def process(self, article):
        """
        Ingests an Article to create navigation structures and parse global
        metadata.

        Returns False when the article is refused because this Navigation is
        not in collection mode and already holds an article. In every other
        case (including an article without a publisher, which is logged as
        an error and ignored) the return value is None.
        """
        if self.article is not None and not self.collection:
            log.warning(
                'Could not process additional article. Navigation only '
                'handles one article unless collection mode is set.')
            return False

        if article.publisher is None:
            log.error('Navigation cannot be generated for an Article '
                      'without a publisher!')
            return
        self.article = article
        #The second '/'-separated piece of the DOI is used in ids and filenames
        self.article_doi = self.article.doi.split('/')[1]
        self.all_dois.append(self.article.doi)
        #A collection keeps the title it was constructed with
        if not self.collection:
            self.title = self.article.publisher.nav_title()
        for author in self.article.publisher.nav_contributors():
            self.contributors.add(author)

        #Analyze the structure of the article to create internal mapping
        self.map_navigation()

    def map_navigation(self):
        """
        This is a wrapper for depth-first recursive analysis of the article.

        Adds a title navpoint for the current article, then the navpoints
        found in the article body, then a References navpoint when the
        article root matches './back/ref'.
        """
        #All articles should have titles
        title_navpoint = navpoint(
            'titlepage-{0}'.format(self.article_doi),
            self.article.publisher.nav_title(),
            self.play_order,
            'main.{0}.xhtml#title'.format(self.article_doi),
            [])
        self.nav.append(title_navpoint)
        #When processing a collection of articles, we will want all subsequent
        #navpoints for this article to be located under the title
        if self.collection:
            nav_insertion = title_navpoint.children
        else:
            nav_insertion = self.nav

        #If the article has a body, we'll need to parse it for navigation
        if self.article.body is not None:
            #Here is where we invoke the recursive parsing!
            nav_insertion.extend(
                self.recursive_article_navmap(self.article.body))

        #Add a navpoint to the references if appropriate
        if self.article.root.xpath('./back/ref'):
            nav_insertion.append(navpoint(
                'references-{0}'.format(self.article_doi),
                'References',
                self.play_order,
                'biblio.{0}.xhtml#references'.format(self.article_doi),
                []))

    def recursive_article_navmap(self, src_element, depth=0, first=True):
        """
        This function recursively traverses the content of an input article to
        add the correct elements to the NCX file's navMap and Lists.

        Only 'sec', 'fig' and 'table-wrap' children that have a non-empty
        immediate 'title' child are considered. Sections are returned as a
        list of navpoints (with their subsections as children); figures and
        tables are appended to self.figures_list and self.tables_list.
        Children lacking an 'id' attribute are given one from self.auto_id.

        The 'first' parameter is accepted for interface compatibility but is
        not used.
        """
        if depth > self.nav_depth:
            self.nav_depth = depth
        navpoints = []
        for child in src_element:
            #Children without a tag attribute are ignored, as are all tags
            #other than the three handled below
            tagname = getattr(child, 'tag', None)
            if tagname not in ('sec', 'fig', 'table-wrap'):
                continue

            #Safely handle missing id attributes
            if 'id' not in child.attrib:
                child.attrib['id'] = self.auto_id
            element_id = child.attrib['id']

            #If in collection mode, we'll prepend the article DOI to avoid
            #collisions
            if self.collection:
                child_id = '-'.join([self.article_doi, element_id])
            else:
                child_id = element_id

            #Attempt to infer the correct text as a label
            #Skip the element if we cannot
            child_title = child.find('title')
            if child_title is None:
                continue  # If there is no immediate title, skip this element
            label = element_methods.all_text(child_title)
            if not label:
                continue  # If no text in the title, skip this element
            source = 'main.{0}.xhtml#{1}'.format(self.article_doi, element_id)

            if tagname == 'sec':
                #Claim the play order BEFORE descending: a section comes
                #ahead of its own subsections in the reading order, so it
                #must receive the lower playOrder value
                order = self.play_order
                children = self.recursive_article_navmap(child,
                                                         depth=depth + 1)
                navpoints.append(
                    navpoint(child_id, label, order, source, children))
            #figs and table-wraps do not have children
            elif tagname == 'fig':  # Add navpoints to list_of_figures
                self.figures_list.append(
                    navpoint(element_id, label, None, source, []))
            else:  # table-wrap: add navpoints to list_of_tables
                self.tables_list.append(
                    navpoint(element_id, label, None, source, []))
        return navpoints

    def render_EPUB2(self, location):
        """
        Creates the NCX specified file for EPUB2.

        The document is written, UTF-8 encoded, to
        <location>/EPUB/toc.ncx.
        """
        def make_navlabel(text):
            """
            Creates and returns a navLabel element with the supplied text.
            """
            navlabel = etree.Element('navLabel')
            navlabel_text = etree.SubElement(navlabel, 'text')
            navlabel_text.text = text
            return navlabel

        def make_navMap(nav=None):
            """
            Without an argument, builds the navMap element from self.nav;
            with a navpoint, builds that navPoint element and, recursively,
            its children.
            """
            if nav is None:
                nav_element = etree.Element('navMap')
                for nav_point in self.nav:
                    nav_element.append(make_navMap(nav=nav_point))
            else:
                nav_element = etree.Element('navPoint')
                nav_element.attrib['id'] = nav.id
                nav_element.attrib['playOrder'] = nav.playOrder
                nav_element.append(make_navlabel(nav.label))
                content_element = etree.SubElement(nav_element, 'content')
                content_element.attrib['src'] = nav.source
                for child in nav.children:
                    nav_element.append(make_navMap(nav=child))
            return nav_element

        def add_navlist(parent, heading, targets):
            """
            Appends a navList element labelled with heading to parent,
            holding one navTarget per navpoint in targets.
            """
            navlist = etree.SubElement(parent, 'navList')
            navlist.append(make_navlabel(heading))
            for nav_pt in targets:
                navtarget = etree.SubElement(navlist, 'navTarget')
                navtarget.attrib['id'] = nav_pt.id
                #make_navlabel is a local function, not a method of self
                navtarget.append(make_navlabel(nav_pt.label))
                content = etree.SubElement(navtarget, 'content')
                content.attrib['src'] = nav_pt.source

        ncx = etree.XML('''\
<?xml version="1.0"?>\
<ncx version="2005-1" xmlns="http://www.daisy.org/z3986/2005/ncx/">\
<head>\
<meta name="dtb:uid" content="{uid}"/>\
<meta name="dtb:depth" content="{depth}"/>\
<meta name="dtb:totalPageCount" content="0"/>\
<meta name="dtb:maxPageNumber" content="0"/>\
<meta name="dtb:generator" content="OpenAccess_EPUB {version}"/>\
</head>\
</ncx>'''.format(uid=','.join(self.all_dois),
                 depth=self.nav_depth,
                 version=__version__))
        document = etree.ElementTree(ncx)

        #Create the docTitle element
        doctitle = etree.SubElement(ncx, 'docTitle')
        doctitle_text = etree.SubElement(doctitle, 'text')
        doctitle_text.text = self.title

        #Create the docAuthor elements
        for contributor in self.contributors:
            if contributor.role == 'author':
                docauthor = etree.SubElement(ncx, 'docAuthor')
                docauthor_text = etree.SubElement(docauthor, 'text')
                docauthor_text.text = contributor.name

        #Create the navMap element
        ncx.append(make_navMap())

        #Create the optional navList elements
        if self.figures_list:
            add_navlist(ncx, 'List of Figures', self.figures_list)
        if self.tables_list:
            add_navlist(ncx, 'List of Tables', self.tables_list)

        with open(os.path.join(location, 'EPUB', 'toc.ncx'), 'wb') as output:
            output.write(
                etree.tostring(document, encoding='utf-8', pretty_print=True))

    def render_EPUB3(self, location):
        """
        Creates the XHTML navigation document for EPUB3.

        The document is written, UTF-8 encoded, to
        <location>/EPUB/nav.xhtml.
        """
        def make_nav(nav=None):
            """
            Without an argument, builds the top-level ol element from
            self.nav; with a navpoint, builds its li element and,
            recursively, a nested ol for any children.
            """
            if nav is None:
                nav_element = etree.Element('ol')
                for nav_point in self.nav:
                    nav_element.append(make_nav(nav=nav_point))
            else:
                nav_element = etree.Element('li')
                a = etree.SubElement(nav_element, 'a')
                a.attrib['href'] = nav.source
                a.text = nav.label
                if nav.children:
                    ol = etree.SubElement(nav_element, 'ol')
                    for child in nav.children:
                        ol.append(make_nav(nav=child))
            return nav_element

        def add_link_list(parent, heading, entries):
            """
            Appends a nav element to parent containing an h2 heading and an
            ordered list with one link per navpoint in entries.
            """
            list_nav = etree.SubElement(parent, 'nav')
            list_h2 = etree.SubElement(list_nav, 'h2')
            list_h2.text = heading
            ol = etree.SubElement(list_nav, 'ol')
            for nav_pt in entries:
                li = etree.SubElement(ol, 'li')
                a = etree.SubElement(li, 'a')
                a.attrib['href'] = nav_pt.source
                a.text = nav_pt.label

        html = etree.XML('''\
<?xml version="1.0"?>\
<!DOCTYPE html>\
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">\
<head>\
<link rel="stylesheet" type="text/css" href="css/default.css" />\
</head>\
</html>''')
        document = etree.ElementTree(html)

        #html[0] is the head element of the template above
        title = etree.SubElement(html[0], 'title')
        title.text = self.title

        body = etree.SubElement(html, 'body')  # Create the body element
        #Create the primary nav element
        nav = etree.SubElement(body, 'nav')
        nav.attrib['{http://www.idpf.org/2007/ops}type'] = 'toc'
        nav.attrib['id'] = 'toc'

        #Create the title
        h2 = etree.SubElement(nav, 'h2')
        h2.text = 'Table of Contents'

        #Add the table of contents itself
        nav.append(make_nav())

        #Create the optional lists; each one is built from its own list
        if self.figures_list:
            add_link_list(body, 'List of Figures', self.figures_list)
        if self.tables_list:
            add_link_list(body, 'List of Tables', self.tables_list)

        with open(os.path.join(location, 'EPUB', 'nav.xhtml'), 'wb') as output:
            output.write(
                etree.tostring(document, encoding='utf-8', pretty_print=True))

    @property
    def play_order(self):
        """
        Increments the play order counter and returns the new value as a
        string. Every access advances the counter.
        """
        self._play_order += 1
        return str(self._play_order)

    @property
    def auto_id(self):
        """
        Increments the automatic ID counter and returns a new ID of the form
        'OAE-<n>', logging the assignment at debug level. Every access
        advances the counter.
        """
        self._auto_id += 1
        id_gen = 'OAE-{0}'.format(self._auto_id)
        log.debug('Navigation element missing ID: assigned {0}'.format(id_gen))
        return id_gen