Пример #1
0
Файл: epub.py Проект: hornc/epub
    def __init__(self, out_name, metadata, content_dir='OEBPS/'):
        """Open the output archive and create the initial package documents.

        The ``mimetype`` entry is added first (with ``deflate=False``),
        followed by ``META-INF/container.xml``.  The OPF and NCX trees are
        then created, the navigation / part bookkeeping is initialised and
        the static files shipped in the ``epub_files`` directory next to the
        running script are added as content.

        Arguments:
        out_name -- passed straight to ``zipfile.ZipFile(out_name, 'w')``.
        metadata -- book metadata in the form expected by
                    ``common.get_metadata_tag_data`` and ``make_opf``.
        content_dir -- location of the content documents inside the archive;
                       handed to ``make_container_info``.
        """
        self.content_dir = content_dir
        self.dt = datetime.now()
        self.z = zipfile.ZipFile(out_name, 'w')
        # Must stay the very first entry written to the archive.
        self.add('mimetype', 'application/epub+zip', deflate=False)

        self.book_id = common.get_metadata_tag_data(metadata, 'identifier')
        # Missing title / author fall back to the literal string 'none'.
        self.title = common.get_metadata_tag_data(metadata, 'title')
        if self.title is None:
            self.title = 'none'
        self.author = common.get_metadata_tag_data(metadata, 'creator')
        if self.author is None:
            self.author = 'none'

        tree_str = make_container_info(content_dir)
        self.add('META-INF/container.xml', tree_str)

        (self.opf, self.opf_manifest_el,
         self.opf_spine_el, self.opf_guide_el) = make_opf(metadata)

        (self.ncx, self.ncx_head_el,
         self.ncx_navmap_el) = make_ncx(self.book_id,
                                        self.title,
                                        self.author)
        self.ncx_pagelist_el = None

        # Navigation state: the stack starts at the NCX navMap element.
        self.navpoint_stack = [self.ncx_navmap_el]
        self.id_index = 1
        self.nav_number = 1
        self.depth = 0
        self.current_depth = 0

        # State for elements buffered before being written out as a part.
        self.el_stack = []
        self.el_len_total = 0
        self.max_el_len_total = 150000
        self.part_number = 0
        self.current_part = None

        # Add static extra files - style sheet, etc.
        static_files = [('css', 'stylesheet.css', 'text/css')]
        for item_id, href, media_type in static_files:
            content_src = os.path.join(sys.path[0], 'epub_files', href)
            # Use a context manager so the handle is closed promptly
            # instead of being left to the garbage collector.
            with open(content_src, 'r') as content_file:
                content_str = content_file.read()
            self.add_content(item_id, href, media_type, content_str)
Пример #2
0
def _page_type(page_scandata):
    """Return the lower-cased pageType text of a scandata page.

    Falls back to 'normal' when the page has no usable pageType (the
    attribute lookup or the ``.text.lower()`` call raises AttributeError).
    """
    try:
        return page_scandata.pageType.text.lower()
    except AttributeError:
        return 'normal'


def _paragraph_text(par):
    """Join the formatting runs of one ABBYY paragraph into one string.

    Runs are separated by a single space (so the result starts with a
    space whenever the paragraph has at least one non-empty run).  When the
    previous run ends with '-', no space is inserted; the hyphen itself is
    dropped only if the first child of the current run carries
    wordStart="false", i.e. the word continues across the break.
    """
    pieces = []
    prev_run = ''
    for line in par:
        for fmt in line:
            fmt_text = etree.tostring(fmt,
                                      method='text',
                                      encoding=unicode)
            if len(fmt_text) == 0:
                continue
            if prev_run[-1:] == '-':
                if fmt[0].get('wordStart') == 'false':
                    # ? and wordFromDictionary = true ?
                    pieces.append(prev_run[:-1])
                else:
                    pieces.append(prev_run)
            else:
                pieces.append(prev_run)
                pieces.append(' ')
            prev_run = fmt_text
    pieces.append(prev_run)
    return ''.join(pieces)


def process_book(iabook, ebook, alt_booktext=None):
    """Fill a DAISY book with front matter, body text and rear matter.

    Arguments:
    iabook -- source book object; this function uses its get_scandata,
              get_abbyy, get_scandata_ns, get_bookdata, get_toc,
              get_metadata, has_pagenos, get_scandata_pages and
              get_page_scandata methods.
    ebook -- output book; driven through push_tag / pop_tag / add_tag,
             push_navpoint / pop_navpoint and add_pagetarget.
    alt_booktext -- when not None, this text is added as a single paragraph
                    instead of the OCR text.  It is added when the first
                    ABBYY page is reached, so nothing is added if the ABBYY
                    file yields no pages.

    Only pages whose scandata marks them addToAccessFormats == 'true' are
    considered.  Of those, only 'normal' pages at or after the title page
    contribute text; a leading or trailing paragraph that looks like a page
    number / running header is dropped (at most one per page).

    Exits the process via sys.exit(-1) on an unexpected ABBYY element.
    """
    aby_ns = "{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}"
    # NOTE(review): the results of get_scandata() / get_bookdata() are not
    # used here; the calls are kept in case they load or cache data that
    # later getters rely on - confirm and remove if not.
    iabook.get_scandata()
    aby_file = iabook.get_abbyy()

    scandata_ns = iabook.get_scandata_ns()
    iabook.get_bookdata()

    contents = iabook.get_toc()
    metadata = iabook.get_metadata()
    title = common.get_metadata_tag_data(metadata, 'title')
    if title is None:
        title = ''
    author = common.get_metadata_tag_data(metadata, 'creator')
    if author is None:
        author = ''

    # ---- front matter -------------------------------------------------
    ebook.push_tag('frontmatter')
    ebook.add_tag('doctitle', title)
    ebook.add_tag('docauthor', author)

    ebook.push_navpoint('level', 'h', 'Producer\'s Note')
    ebook.push_navpoint('level', 'h', 'About Internet Archive Daisy Books')
    ebook.add_tag('p', """This book was produced in DAISY format by the Internet Archive.  The
    book pages were scanned and converted to DAISY format
    automatically.  This process relies on optical character
    recognition, and is somewhat susceptible to errors.  These errors
    may include weird characters, non-words, and incorrect guesses at
    structure.  Page numbers and headers or footers may remain from
    the scanned page.  The Internet Archive is working to improve the
    scanning process and resulting books, but in the meantime, we hope
    that this book will be useful to you.
    """)
    ebook.pop_navpoint()
    ebook.push_navpoint('level', 'h', 'About this DAISY book')
    has_nav = False
    if iabook.has_pagenos():
        has_nav = True
        ebook.add_tag('p', "This book has page navigation.")
    if contents is not None:
        has_nav = True
        ebook.add_tag('p', "This book has chapter navigation.")
    if not has_nav:
        ebook.add_tag('p', "This book has paragraph navigation, "
                      "but is otherwise unstructured.")
    ebook.pop_navpoint()
    ebook.push_navpoint('level', 'h', 'About the Internet Archive')
    ebook.add_tag('p', """The Internet Archive was founded in 1996
    to build an Internet library
and to promote universal access to all knowledge.  The Archive's purposes
include offering permanent access for researchers, historians,
scholars, people with disabilities, and the general public to
historical collections that exist in digital format.  The Internet Archive
includes texts, audio, moving images, and software as well as archived
web pages, and provides specialized services for information access
for the blind and other persons with disabilities.
    """)
    ebook.pop_navpoint()
    ebook.pop_navpoint()

    ebook.pop_tag()

    # ---- body matter --------------------------------------------------
    ebook.push_tag('bodymatter')

    # Without a table of contents the whole body lives under one navpoint.
    if contents is None:
        ebook.push_navpoint('level', 'h', 'Book')

    pushed_navpoint = False
    context = etree.iterparse(aby_file,
                              tag=aby_ns+'page',
                              resolve_entities=False)

    # Normal pages in front of the title page are skipped, so start in the
    # "before title page" state only if the book really has a title page;
    # the flag is cleared once that page is reached.
    title_types = ('title', 'title page')
    before_title_page = any(_page_type(page_scandata) in title_types
                            for page_scandata in iabook.get_scandata_pages())

    def include_page(page_scandata):
        # A page is included only when it is explicitly flagged 'true'.
        if page_scandata is None:
            return False
        add = page_scandata.find(scandata_ns + 'addToAccessFormats')
        if add is None:
            add = page_scandata.addToAccessFormats
        return add is not None and add.text == 'true'

    for i, (_event, page) in enumerate(context):
        # wrap in try/finally to ensure page.clear() is called
        try:
            if alt_booktext is not None:
                ebook.add_tag('p', alt_booktext)
                break

            page_scandata = iabook.get_page_scandata(i)
            pageno = None
            if page_scandata is not None:
                pageno = page_scandata.find(scandata_ns + 'pageNumber')
                # NOTE(review): relies on the truth value of the element
                # returned by find() - verify against the scandata parser.
                if pageno:
                    pageno = pageno.text
            if pageno:
                if contents is not None and pageno in contents:
                    # A new chapter starts here: close the previous one.
                    if pushed_navpoint:
                        ebook.pop_navpoint()
                    ebook.push_navpoint('level', 'h', contents[pageno])
                    pushed_navpoint = True
                ebook.add_pagetarget(pageno, pageno)

            if not include_page(page_scandata):
                continue

            page_type = _page_type(page_scandata)
            if page_type in title_types:
                before_title_page = False
                continue
            # Cover, copyright, contents and unknown page types add no
            # text, and neither do normal pages before the title page.
            if page_type != 'normal' or before_title_page:
                continue

            first_par = True
            saw_pageno_header_footer = False

            for block in page:
                for el in block:
                    if el.tag == aby_ns+'text':
                        for par in el:
                            # skip if it's the first paragraph and it
                            # could be a header
                            if (first_par
                                and common.par_is_pageno_header_footer(par)):
                                saw_pageno_header_footer = True
                                first_par = False
                                continue
                            first_par = False

                            # skip if it's the last paragraph and it could
                            # be a footer (unless a header was already
                            # dropped on this page)
                            if (not saw_pageno_header_footer
                                and block == page[-1]
                                and el == block[-1]
                                and par == el[-1]
                                and common.par_is_pageno_header_footer(par)):
                                saw_pageno_header_footer = True
                                continue

                            ebook.add_tag('p', _paragraph_text(par))
                    elif el.tag in (aby_ns+'region', aby_ns+'row'):
                        # Layout regions and table rows carry no text that
                        # is used here.
                        pass
                    else:
                        print('unexpected tag type' + el.tag)
                        sys.exit(-1)
        finally:
            page.clear()

    if pushed_navpoint:
        ebook.pop_navpoint()

    if contents is None:
        ebook.pop_navpoint() #level1

    ebook.pop_tag()

    # ---- rear matter --------------------------------------------------
    ebook.push_tag('rearmatter')
    ebook.push_tag('level1')
    ebook.add_tag('p', 'End of book')
    ebook.pop_tag()
    ebook.pop_tag()
Пример #3
0
def _page_type(page_scandata):
    """Return the lower-cased pageType text of a scandata page.

    Falls back to 'normal' when the page has no usable pageType (the
    attribute lookup or the ``.text.lower()`` call raises AttributeError).
    """
    try:
        return page_scandata.pageType.text.lower()
    except AttributeError:
        return 'normal'


def _paragraph_text(par):
    """Join the formatting runs of one ABBYY paragraph into one string.

    Runs are separated by a single space (so the result starts with a
    space whenever the paragraph has at least one non-empty run).  When the
    previous run ends with '-', no space is inserted; the hyphen itself is
    dropped only if the first child of the current run carries
    wordStart="false", i.e. the word continues across the break.
    """
    pieces = []
    prev_run = ''
    for line in par:
        for fmt in line:
            fmt_text = etree.tostring(fmt,
                                      method='text',
                                      encoding=unicode)
            if len(fmt_text) == 0:
                continue
            if prev_run[-1:] == '-':
                if fmt[0].get('wordStart') == 'false':
                    # ? and wordFromDictionary = true ?
                    pieces.append(prev_run[:-1])
                else:
                    pieces.append(prev_run)
            else:
                pieces.append(prev_run)
                pieces.append(' ')
            prev_run = fmt_text
    pieces.append(prev_run)
    return ''.join(pieces)


def process_book(iabook, ebook):
    """Fill an EPUB book from the scandata and ABBYY OCR of ``iabook``.

    Arguments:
    iabook -- source book object; this function uses its get_scandata,
              get_scandata_ns, get_bookdata, get_abbyy, get_toc,
              get_metadata, get_scandata_pages, get_page_scandata and
              get_page_image methods.
    ebook -- output book; driven through add_el / flush_els, add_content,
             add_navpoint / push_navpoint / pop_navpoint, add_pagetarget,
             add_guide_item and add_spine_item.

    Only pages whose scandata marks them addToAccessFormats == 'true' are
    rendered.  Cover, title, copyright and contents pages are emitted as
    page images (via make_html_page_image) with matching navpoints and
    guide entries.  Normal pages before the title page are emitted as page
    images when they hold at least 10 characters of text; normal pages from
    the title page on are emitted as paragraphs, with Picture blocks
    extracted as JPEG images.

    Relies on the module-level ``max_width`` / ``max_height`` for picture
    sizing.  Exits the process via sys.exit(-1) on an unexpected ABBYY
    element.
    """
    aby_ns = "{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}"
    # NOTE(review): the results of get_scandata(), get_bookdata() and
    # get_metadata() are not used here; the calls are kept in case they
    # load or cache data that later getters rely on - confirm and remove
    # if not.
    iabook.get_scandata()

    scandata_ns = iabook.get_scandata_ns()
    iabook.get_bookdata()

    aby_file = iabook.get_abbyy()

    contents = iabook.get_toc()
    iabook.get_metadata()

    cover_number = 0
    toc_item_number = 0
    picture_number = 0
    pushed_chapters = False
    made_contents_navpoint = False
    made_pages = False
    context = etree.iterparse(aby_file,
                              tag=aby_ns+'page',
                              resolve_entities=False)

    # Normal pages in front of the title page are rendered as images, so
    # start in the "before title page" state only if the book really has a
    # title page; the flag is cleared once that page is reached.
    title_types = ('title', 'title page')
    before_title_page = any(_page_type(page_scandata) in title_types
                            for page_scandata in iabook.get_scandata_pages())

    def include_page(page_scandata):
        # A page is included only when it is explicitly flagged 'true'.
        if page_scandata is None:
            return False
        add = page_scandata.find(scandata_ns + 'addToAccessFormats')
        if add is None:
            add = page_scandata.addToAccessFormats
        return add is not None and add.text == 'true'

    for i, (_event, page) in enumerate(context):
        # wrap in try/finally to ensure page.clear() is called
        try:
            page_scandata = iabook.get_page_scandata(i)
            pageno = None
            if page_scandata is not None:
                pageno = page_scandata.find(scandata_ns + 'pageNumber')
                # NOTE(review): relies on the truth value of the element
                # returned by find() - verify against the scandata parser.
                if pageno:
                    pageno = pageno.text
            if pageno:
                if contents is not None and pageno in contents:
                    # A chapter starts on this page: flush what is buffered
                    # and add an anchor + navpoint for it.
                    ebook.flush_els()
                    if not pushed_chapters:
                        cdiv = E.div({ 'class':'newnav', 'id':'chapters' })
                        href = ebook.add_el(cdiv) + '#' + 'chapters'
                        ebook.push_navpoint('Chapters', href)
                        pushed_chapters = True
                    toc_id = 'toc-' + str(toc_item_number)
                    toc_item_number += 1
                    cdiv = E.div({ 'class':'newnav', 'id':toc_id })
                    href = ebook.add_el(cdiv) + '#' + toc_id
                    ebook.add_navpoint(contents[pageno], href)

                # Every numbered page gets an anchor and a page target.
                page_anchor = 'page-' + pageno
                pdiv = E.div({ 'class':'newpage', 'id':page_anchor })
                href = ebook.add_el(pdiv) + '#' + page_anchor
                ebook.add_pagetarget(pageno, pageno, href)

            if not include_page(page_scandata):
                continue

            page_type = _page_type(page_scandata)

            if page_type == 'cover':
                # The first cover page seen is the front cover; any later
                # one is treated as the back cover.
                front_cover = (cover_number == 0)
                if front_cover:
                    cover_title = 'Front Cover'
                else:
                    cover_title = 'Back Cover' ## xxx detect back page?
                    ebook.flush_els()
                    if pushed_chapters:
                        ebook.pop_navpoint()
                        pushed_chapters = False

                (image_id, filename) = make_html_page_image(i, iabook, ebook,
                                                            cover=front_cover)
                if image_id is not None:
                    ebook.add_navpoint(cover_title, filename)
                    if front_cover:
                        ebook.add_guide_item({ 'href':filename,
                                               'type':'cover',
                                               'title':cover_title })

                        # Add intro page after first cover page
                        tree = make_html('Archive',
                             [E.p('This book made available by the Internet Archive.')])
                        ebook.add_content('intro', 'intro.html',
                                          'application/xhtml+xml',
                                          common.tree_to_str(tree,
                                                             xml_declaration=False))
                        ebook.add_spine_item({ 'idref':'intro', 'linear':'no' })
                    # Only successfully rendered covers are counted.
                    cover_number += 1
                continue

            if page_type in title_types:
                before_title_page = False
                (image_id, filename) = make_html_page_image(i, iabook, ebook)
                if image_id is not None:
                    ebook.add_navpoint('Title Page', filename)
                    ebook.add_guide_item({ 'href':filename,
                                           'type':'title-page',
                                           'title':'Title Page' })
                continue

            if page_type == 'copyright':
                (image_id, filename) = make_html_page_image(i, iabook, ebook)
                if image_id is not None:
                    ebook.add_navpoint('Copyright', filename)
                    # Title matches the navpoint label (was 'Title Page').
                    ebook.add_guide_item({ 'href':filename,
                                           'type':'copyright-page',
                                           'title':'Copyright' })
                continue

            if page_type == 'contents':
                (image_id, filename) = make_html_page_image(i, iabook, ebook)
                if image_id is not None:
                    # Only the first contents page gets a navpoint, but
                    # every contents page gets a guide entry.
                    if not made_contents_navpoint:
                        ebook.add_navpoint('Table of Contents', filename)
                        made_contents_navpoint = True
                    # Title matches the navpoint label (was 'Title Page').
                    ebook.add_guide_item({ 'href':filename,
                                           'type':'toc',
                                           'title':'Table of Contents' })
                continue

            if page_type != 'normal':
                # Unknown page types are ignored.
                continue

            if before_title_page:
                page_text = etree.tostring(page,
                                           method='text',
                                           encoding=unicode)
                # Skip if not much text
                if len(page_text) >= 10:
                    make_html_page_image(i, iabook, ebook)
                # XXX note that above might return None, None and do nothing...
                continue

            first_par = True
            saw_pageno_header_footer = False

            for block in page:
                if block.get('blockType') == 'Picture':
                    left = int(block.get('l'))
                    top = int(block.get('t'))
                    right = int(block.get('r'))
                    bottom = int(block.get('b'))
                    region = ((left, top), (right, bottom))
                    region_width = right - left
                    region_height = bottom - top
                    orig_page_size = (int(page.get('width')),
                                      int(page.get('height')))
                    page_width, page_height = orig_page_size

                    # XXX bad aspect ratio!
                    # XXX need fixed code to get requested size
                    req_width = int(max_width *
                                    (region_width / float(page_width)))
                    req_height = int(max_height *
                                     (region_height / float(page_height)))
                    image = iabook.get_page_image(i,
                                                  (req_width, req_height),
                                                  orig_page_size,
                                                  kdu_reduce=2,
                                                  region=region)
                    if image is not None:
                        pic_id = 'picture' + str(picture_number)
                        pic_href = 'images/' + pic_id + '.jpg'
                        picture_number += 1
                        ebook.add_content(pic_id, pic_href,
                                          'image/jpeg', image, deflate=False)
                        ebook.add_el(E.p({ 'class':'illus' },
                                         E.img(src=pic_href,
                                               alt=pic_id)))
                    continue

                for el in block:
                    if el.tag == aby_ns+'text':
                        for par in el:
                            # skip if it's the first paragraph and it
                            # could be a header
                            if (first_par
                                and common.par_is_pageno_header_footer(par)):
                                saw_pageno_header_footer = True
                                first_par = False
                                continue
                            first_par = False

                            # skip if it's the last paragraph and it could
                            # be a footer (unless a header was already
                            # dropped on this page)
                            if (not saw_pageno_header_footer
                                and block == page[-1]
                                and el == block[-1]
                                and par == el[-1]
                                and common.par_is_pageno_header_footer(par)):
                                saw_pageno_header_footer = True
                                continue

                            to_add = _paragraph_text(par)

                            # Books without a table of contents get a
                            # single 'Pages' navpoint at the first
                            # paragraph of body text.
                            if not made_pages:
                                made_pages = True
                                if not contents:
                                    href = ebook.add_el(E.div({ 'class':'pages', 'id':'pages' }))
                                    ebook.add_navpoint('Pages', href)
                            ebook.add_el(E.p(to_add), len(to_add))
                    elif el.tag in (aby_ns+'region', aby_ns+'row'):
                        # Layout regions and table rows carry no text that
                        # is used here.
                        pass
                    else:
                        print('unexpected tag type' + el.tag)
                        sys.exit(-1)
        finally:
            page.clear()

    ebook.flush_els()
    if pushed_chapters:
        ebook.pop_navpoint()
Пример #4
0
    def __init__(self, out_name, metadata, content_dir=''):
        """Open the output archive and create the initial DAISY documents.

        Derives the OPF / DTBook / SMIL / NCX file names from the book
        identifier, creates the DTBook, SMIL and NCX trees, initialises the
        tag / navpoint stacks and counters, copies the static support files
        from the ``daisy_files`` directory next to the running script into
        the archive, and builds the initial manifest item list.

        Arguments:
        out_name -- passed straight to ``zipfile.ZipFile(out_name, 'w')``.
        metadata -- book metadata in the form expected by
                    ``common.get_metadata_tag_data``.
        content_dir -- prefix prepended to the names of the static files
                       inside the archive.
        """
        self.dt = datetime.now()
        self.z = zipfile.ZipFile(out_name, 'w')
        self.content_dir = content_dir
        self.book_id = common.get_metadata_tag_data(metadata, 'identifier')
        self.title = common.get_metadata_tag_data(metadata, 'title')
        self.author = common.get_metadata_tag_data(metadata, 'creator')
        self.nav_number = 1

        # All package documents are named '<book_id>_daisy.<ext>'.
        self.opf_file = self.book_id + '_daisy.opf'

        self.dtbook_file = self.book_id + '_daisy.xml'
        self.dtbook, self.dtbook_book_el = make_dtbook(self.book_id, self.title)

        self.smil_file = self.book_id + '_daisy.smil'
        self.smil, self.smil_seq_el = make_smil(self.book_id)

        self.ncx_file = self.book_id + '_daisy.ncx'
        (self.ncx, self.ncx_head_el,
         self.ncx_navmap_el, self.ncx_pagelist_el) = make_ncx(
            self.book_id, self.title, self.author)

        # Stacks start at the DTBook <book> element and the NCX navMap.
        self.tag_stack = [self.dtbook_book_el]
        self.navpoint_stack = [self.ncx_navmap_el]

        self.id_index = 1

        self.depth = 0
        self.current_depth = 0
        self.total_page_count = 0
        self.max_page_number = 0

        # style sheet, etc.
        static_files = ['daisy.css', 'daisyTransform.xsl',
                        'dtbook-2005-3.dtd', 'html.css',
                        'resource.res']
        for content in static_files:
            content_src = os.path.join(sys.path[0], 'daisy_files', content)
            # Use a context manager so the handle is closed promptly
            # instead of being left to the garbage collector.
            with open(content_src, 'r') as content_file:
                content_str = content_file.read()
            self.add(self.content_dir + content, content_str)

        self.manifest_items = [
            { 'id':'xml',
              'href':self.dtbook_file,
              'media-type':'application/x-dtbook+xml'
              },
            { 'id':'opf',
              'href':self.opf_file,
              'media-type':'text/xml'
              },
            { 'id':'ncx',
              'href':self.ncx_file,
              'media-type':'application/x-dtbncx+xml'
              },
            { 'id':'smil',
              'href':self.smil_file,
              'media-type':'application/smil'
              },
            { 'id':'daisyTransform',
              'href':'daisyTransform.xsl',
              'media-type':'text/xsl'
              },
            { 'id':'daisyCss',
              'href':'daisy.css',
              'media-type':'text/css'
              },
            { 'id':'htmlCss',
              'href':'html.css',
              'media-type':'text/css'
              },
            { 'id':'resource',
              'href':'resource.res',
              'media-type':'application/x-dtbresource+xml'
              },
            ]