def finish(self, metadata):
    """Serialize the NCX and OPF trees, store them, and close ``self.z``.

    The NCX goes through ``self.add_content`` under the id ``'ncx'``; the OPF
    is stored with ``self.add`` at ``self.content_dir + 'content.opf'``.
    ``metadata`` is not read by the statements in this method.
    """
    # ... Any remaining html?
    ncx_xml = common.tree_to_str(self.ncx)
    self.add_content('ncx',
                     'toc.ncx',
                     'application/x-dtbncx+xml',
                     ncx_xml)

    opf_xml = common.tree_to_str(self.opf)
    self.add(self.content_dir + 'content.opf', opf_xml)

    # Presumably self.z is the output zip archive -- confirm in the class.
    self.z.close()
def make_opf(meta_info_items, manifest_items, spine_items, guide_items,
             include_page_map, cover_id=None):
    """Build an OPF 2.0 ``package`` document and return it serialized.

    meta_info_items -- dicts with an ``'item'`` tag name, optional ``'atts'``
                       attribute dict and optional ``'text'`` content.
    manifest_items  -- attribute dicts, one per manifest ``item``.
    spine_items     -- attribute dicts, one per spine ``itemref``; the spine
                       element is omitted when this is empty.
    guide_items     -- attribute dicts, one per guide ``reference``; the
                       guide element is omitted when this is empty.
    include_page_map -- when true, the spine gets ``page-map="page-map"``.
    cover_id        -- currently unused (see the commented-out code below).

    Returns whatever ``common.tree_to_str`` produces for the package element.
    """
    package_attrs = {
        'xmlns' : 'http://www.idpf.org/2007/opf',
        'unique-identifier' : 'bookid',
        'version' : '2.0',
    }
    package = etree.Element('package', package_attrs, nsmap={'dc' : dc })

    metadata_el = etree.SubElement(package, 'metadata')
    for info in meta_info_items:
        node = etree.SubElement(metadata_el, info['item'], info.get('atts'))
        if 'text' in info:
            node.text = info['text']

    manifest_el = etree.SubElement(package, 'manifest')
    for attrs in manifest_items:
        etree.SubElement(manifest_el, 'item', attrs)
    # if cover_id is not None:
    #     etree.SubElement(manifest, 'meta', name='cover',
    #                      content=cover_id)

    if len(spine_items):
        spine_attrs = { 'toc':'ncx' }
        if include_page_map:
            spine_attrs['page-map'] = 'page-map'
        spine_el = etree.SubElement(package, 'spine', spine_attrs)
        for attrs in spine_items:
            etree.SubElement(spine_el, 'itemref', attrs)

    if len(guide_items):
        guide_el = etree.SubElement(package, 'guide')
        for attrs in guide_items:
            etree.SubElement(guide_el, 'reference', attrs)

    return common.tree_to_str(package)
def make_opf(metadata, manifest_items):
    """Build the OEB 1.2 package file for a DAISY book and return it serialized.

    metadata       -- iterable of dicts with ``'tag'`` and ``'text'`` keys;
                      only Dublin Core tag names from the list below are
                      copied, everything else is skipped.
    manifest_items -- attribute dicts, one per manifest ``item``.

    The spine always contains a single ``itemref`` pointing at ``'smil'``.
    Returns whatever ``common.tree_to_str`` produces for the document.
    """
    xml = """<?xml version='1.0' encoding='utf-8'?> <!DOCTYPE package PUBLIC "+//ISBN 0-9673008-1-9//DTD OEB 1.2 Package//EN" "http://openebook.org/dtds/oeb-1.2/oebpkg12.dtd"> <package xmlns="http://openebook.org/namespaces/oeb-package/1.0/" unique-identifier="bookid"/> """
    root_el = etree.parse(StringIO(xml)).getroot()

    metadata_el = etree.SubElement(root_el, 'metadata')
    dc_metadata_el = etree.SubElement(
        metadata_el, 'dc-metadata',
        nsmap={ 'dc':dc,
                'oebpackage':'http://openebook.org/namespaces/oeb-package/1.0/' })
    format_el = etree.SubElement(dc_metadata_el, dcb + 'Format')
    format_el.text = 'ANSI/NISO Z39.86-2005'

    # TODO: ensure required elements: (copy code from epub.py)
    #   title
    #   publisher
    #   date
    #   format (must be 'ANSI/NISO Z39.86-2005')
    #   language
    #   identifier
    dc_tag_names = [ 'title', 'creator', 'subject', 'description',
                     'publisher', 'contributor', 'date', 'type',
                     'format', 'identifier', 'source', 'language',
                     'relation','coverage', 'rights' ]
    for md in metadata:
        tagname = md['tag']
        if tagname not in dc_tag_names:
            continue
        # Element names are the tag name with its first letter upper-cased.
        dctag = dcb + tagname[:1].upper() + tagname[1:]
        if tagname == 'identifier':
            # The identifier carries the id named by unique-identifier above.
            dc_el = etree.SubElement(dc_metadata_el, dctag, { 'id':'bookid' })
        else:
            dc_el = etree.SubElement(dc_metadata_el, dctag)
        dc_el.text = md['text']

    x_metadata_el = etree.SubElement(metadata_el, 'x-metadata')
    etree.SubElement(x_metadata_el, 'meta',
                     { 'name':'dtb:multimediaType', 'content':'textNCX' })
    etree.SubElement(x_metadata_el, 'meta',
                     { 'name':'dtb:multimediaContent', 'content':'text' })
    etree.SubElement(x_metadata_el, 'meta',
                     { 'name':'dtb:totalTime', 'content':'0' })

    manifest_el = etree.SubElement(root_el, 'manifest')
    for attrs in manifest_items:
        etree.SubElement(manifest_el, 'item', attrs)

    spine_el = etree.SubElement(root_el, 'spine')
    etree.SubElement(spine_el, 'itemref', { 'idref':'smil' })

    return common.tree_to_str(etree.ElementTree(root_el))
def make_container_info(content_dir='OEBPS/'):
    """Return the serialized ``container`` document that locates the OPF.

    content_dir -- path prefix (including trailing slash) prepended to
                   ``'content.opf'`` in the rootfile's ``full-path``.
    """
    container = etree.Element(
        'container',
        version='1.0',
        xmlns='urn:oasis:names:tc:opendocument:xmlns:container')
    rootfiles_el = etree.SubElement(container, 'rootfiles')
    rootfile_attrs = {
        'full-path' : content_dir + 'content.opf',
        'media-type' : 'application/oebps-package+xml',
    }
    etree.SubElement(rootfiles_el, 'rootfile', rootfile_attrs)
    return common.tree_to_str(container)
def finish(self, metadata):
    """Write the OPF, NCX, DTBook and SMIL documents, then close ``self.z``.

    metadata -- passed straight through to ``make_opf`` together with
                ``self.manifest_items``.

    Before the NCX is serialized, three ``meta`` elements (depth, total page
    count, max page number) are appended to ``self.ncx_head_el`` from the
    corresponding instance counters.
    """
    opf_xml = make_opf(metadata, self.manifest_items)
    self.add(self.content_dir + self.opf_file, opf_xml)

    head_metas = [
        { 'name':'dtb:depth', 'content':str(self.depth) },
        { 'name':'dtb:totalPageCount', 'content':str(self.total_page_count) },
        { 'name':'dtb:maxPageNumber', 'content':str(self.max_page_number) },
    ]
    for meta_attrs in head_metas:
        etree.SubElement(self.ncx_head_el, 'meta', meta_attrs)

    ncx_xml = common.tree_to_str(self.ncx)
    self.add(self.content_dir + self.ncx_file, ncx_xml)

    dtbook_xml = common.tree_to_str(self.dtbook)
    self.add(self.content_dir + self.dtbook_file, dtbook_xml)

    smil_xml = common.tree_to_str(self.smil)
    self.add(self.content_dir + self.smil_file, smil_xml)

    # Presumably self.z is the output zip archive -- confirm in the class.
    self.z.close()
def make_ncx(navpoints, page_items):
    """Build an NCX (2005-1) navigation document and return it serialized.

    navpoints  -- iterable of dicts with ``'playOrder'``, ``'text'`` and
                  ``'content'`` keys; each becomes a ``navPoint`` in the
                  ``navMap``.
    page_items -- sequence of dicts with ``'name'``, ``'value'``, ``'type'``,
                  ``'playOrder'`` and ``'href'`` keys; when non-empty a
                  ``pageList`` with one ``pageTarget`` per item is added.

    The head metadata and the document title are fixed placeholder values.
    Returns whatever ``common.tree_to_str`` produces for the document.
    """
    import StringIO
    xml = """<?xml version='1.0' encoding='utf-8'?> <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd"> <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1"/> """
    tree = etree.parse(StringIO.StringIO(xml))
    root = tree.getroot()

    head = etree.SubElement(root, 'head')
    metas = [
        { 'name' : 'dtb:uid', 'content' : 'test id' },
        { 'name' : 'dtb:depth', 'content' : '1' },
        { 'name' : 'dtb:totalPageCount', 'content' : '0' },
        { 'name' : 'dtb:maxPageNumber', 'content' : '0' },
    ]
    for item in metas:
        etree.SubElement(head, 'meta', item)

    doctitle = etree.SubElement(root, 'docTitle')
    etree.SubElement(doctitle, 'text').text = 'Hello World'

    # navMap element
    navmap = etree.SubElement(root, 'navMap')
    for item in navpoints:
        play_order = str(item['playOrder'])
        navpoint = etree.SubElement(navmap, 'navPoint',
                                    { 'id':'navpoint-' + play_order,
                                      'playOrder':play_order })
        navlabel = etree.SubElement(navpoint, 'navLabel')
        etree.SubElement(navlabel, 'text').text = item['text']
        # XXX 'content' should be 'href'
        etree.SubElement(navpoint, 'content', src=item['content'])

    # pageList element
    if len(page_items) > 0:
        pagelist = etree.SubElement(root, 'pageList',
                                    { 'id':'page-mapping',
                                      'class':'pagelist' })
        navlabel = etree.SubElement(pagelist, 'navLabel')
        etree.SubElement(navlabel, 'text').text = 'Pages'
        for item in page_items:
            target_id = 'page-' + item['name']
            # Attribute values must be strings, so playOrder is converted
            # here just like it is for the navPoint elements above.
            pagetarget = etree.SubElement(pagelist, 'pageTarget',
                                          { 'id':target_id,
                                            'value':str(item['value']),
                                            'type':item['type'],
                                            'playOrder':str(item['playOrder']) })
            # The label belongs to the pageTarget created just above (the
            # previous spelling 'pageTarget' was an undefined name).
            navlabel = etree.SubElement(pagetarget, 'navLabel')
            etree.SubElement(navlabel, 'text').text = 'Page ' + item['name']
            etree.SubElement(pagetarget, 'content', src=item['href'])

    tree = etree.ElementTree(root)
    return common.tree_to_str(tree)
def flush_els(self):
    """Store the part currently being built as ``partNNNN.html`` and reset.

    Does nothing when ``self.current_part`` is None. Otherwise the part is
    serialized (without an XML declaration), registered through
    ``add_content`` and ``add_spine_item``, and -- for part number 0 only --
    also added to the guide as the ``'text'`` entry. Afterwards the part
    counter is advanced and the element stack, running length and current
    part are cleared.
    """
    if self.current_part is None:
        return

    part_id = 'part' + str(self.part_number).zfill(4)
    part_href = part_id + '.html'
    part_xml = common.tree_to_str(self.current_part, xml_declaration=False)
    self.add_content(part_id, part_href, 'application/xhtml+xml', part_xml)
    self.add_spine_item({ 'idref':part_id })

    if self.part_number == 0:
        self.add_guide_item({ 'href':part_href,
                              'type':'text',
                              'title':'Book' })

    self.part_number += 1
    self.el_stack = []  # xxx ? require popped?
    self.el_len_total = 0
    self.current_part = None
def make_html_page_image(i, iabook, ebook):
    """Add leaf ``i`` to ``ebook`` as a JPEG plus an HTML page that shows it.

    i      -- leaf index; used zero-padded to four digits in ids and names.
    iabook -- source of the page image (``get_page_image``).
    ebook  -- receives the image, the wrapping HTML page and a non-linear
              spine entry.

    Returns ``(leaf_id, html_href)`` for the generated HTML page.
    """
    image = iabook.get_page_image(i, width=600, height=800, quality=90)

    padded = str(i).zfill(4)
    leaf_id = 'leaf' + padded
    leaf_image_id = 'leaf-image' + padded
    image_href = 'images/' + leaf_image_id + '.jpg'
    html_href = leaf_id + '.html'

    ebook.add_content({ 'id':leaf_image_id,
                        'href':image_href,
                        'media-type':'image/jpeg' },
                      image)

    img_tag = E.img({ 'src':image_href,
                      'alt':'leaf ' + str(i) })
    tree = make_html('leaf ' + padded, [ img_tag ])
    ebook.add_content({ 'id':leaf_id,
                        'href':html_href,
                        'media-type':'application/xhtml+xml' },
                      common.tree_to_str(tree, xml_declaration=False))
    ebook.add_spine_item({ 'idref':leaf_id, 'linear':'no' })
    return leaf_id, html_href
def make_html_page_image(i, iabook, ebook, cover=False):
    """Flush pending text, then add leaf ``i`` as an image-only HTML page.

    i      -- leaf index; used zero-padded to four digits in ids and names.
    iabook -- source of the page image (``get_page_image``).
    ebook  -- receives the image (stored without deflate), the wrapping HTML
              page and a non-linear spine entry.
    cover  -- when true the image is stored under the id ``'cover-image'``
              instead of a per-leaf id.

    Returns ``(leaf_id, html_href)``, or ``(None, None)`` when no image could
    be obtained (nothing besides the flush happens in that case).
    """
    ebook.flush_els()

    image = iabook.get_page_image(i, (max_width, max_height))
    if image is None:
        return None, None

    padded = str(i).zfill(4)
    leaf_id = 'leaf' + padded
    leaf_image_id = 'cover-image' if cover else 'leaf-image' + padded
    image_href = 'images/' + leaf_image_id + '.jpg'
    html_href = leaf_id + '.html'

    ebook.add_content(leaf_image_id, image_href, 'image/jpeg',
                      image, deflate=False)

    img_tag = E.img({ 'src':image_href,
                      'alt':'leaf ' + str(i) })
    tree = make_html('leaf ' + padded, [ img_tag ])
    ebook.add_content(leaf_id, html_href, 'application/xhtml+xml',
                      common.tree_to_str(tree, xml_declaration=False))
    ebook.add_spine_item({ 'idref':leaf_id, 'linear':'no' })
    return leaf_id, html_href
def make_page_map(page_items):
    """Build a ``page-map`` document and return it serialized.

    page_items -- iterable of dicts with ``'name'`` and ``'href'`` keys; each
                  becomes one ``page`` element.

    Returns whatever ``common.tree_to_str`` produces for the root element.
    """
    root = etree.Element('page-map',
                         xmlns='http://www.idpf.org/2007/opf')
    # Iterate the argument itself; the loop previously referred to an
    # undefined name ('page_map_items') and ignored the parameter.
    for item in page_items:
        etree.SubElement(root, 'page',
                         name=item['name'], href=item['href'])
    return common.tree_to_str(root)
def process_book(iabook, ebook):
    """Stream the ABBYY pages of ``iabook`` and emit them into ``ebook``.

    iabook -- provides scandata, table of contents, metadata, the ABBYY file
              and page images.
    ebook  -- receives elements, content files, navpoints, page targets,
              guide items and spine items.

    Per page, in order:
      * if the scandata gives a page number, a page target is added, plus a
        navpoint nested under 'Chapters' when the number appears in the TOC;
      * pages not flagged ``addToAccessFormats == 'true'`` are skipped;
      * cover / title / copyright / contents pages become page images with
        matching navpoints and guide entries;
      * normal pages before the title page become page images (if they hold
        at least 10 characters of text); later normal pages are converted to
        paragraphs and pictures.

    Exits the process via ``sys.exit(-1)`` on an unexpected ABBYY tag.
    """
    aby_ns = "{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}"
    # Kept although unused below: the accessors may have side effects.
    scandata = iabook.get_scandata()
    scandata_ns = iabook.get_scandata_ns()
    bookData = iabook.get_bookdata()
    aby_file = iabook.get_abbyy()

    # some books no scanlog
#     scanLog = scandata.find(scandata_ns + 'scanLog')
#     if scanLog is None:
#         scanLog = scandata.scanLog

    contents = iabook.get_toc()
    metadata = iabook.get_metadata()
    title = common.get_metadata_tag_data(metadata, 'title')
    if title is None:
        title = 'none'
    author = common.get_metadata_tag_data(metadata, 'creator')
    if author is None:
        author = 'none'

    cover_number = 0
    toc_item_number = 0
    picture_number = 0
    pushed_chapters = False
    made_contents_navpoint = False
    made_pages = False

    def page_type_of(page_scandata):
        # Lower-cased pageType text; 'normal' when it cannot be read.
        try:
            return page_scandata.pageType.text.lower()
        except AttributeError:
            return 'normal'

    def include_page(page_scandata):
        # True only for pages explicitly flagged addToAccessFormats == 'true'.
        if page_scandata is None:
            return False
        add = page_scandata.find(scandata_ns + 'addToAccessFormats')
        if add is None:
            add = page_scandata.addToAccessFormats
        return add is not None and add.text == 'true'

    context = etree.iterparse(aby_file,
                              tag=aby_ns+'page',
                              resolve_entities=False)

    # Confirm that a title page exists at all.
    found_title = False
    for page_scandata in iabook.get_scandata_pages():
        if page_type_of(page_scandata) in ('title', 'title page'):
            found_title = True
            break

    # True while a title page exists and has not been reached yet (normal
    # pages are then emitted as images); False from the title page onwards,
    # and False throughout when the book has no title page.
    before_title_page = found_title

    for i, (event, page) in enumerate(context):
        # wrap in try/finally to ensure page.clear() is called
        try:
            page_scandata = iabook.get_page_scandata(i)

            pageno = None
            if page_scandata is not None:
                pageno = page_scandata.find(scandata_ns + 'pageNumber')
                # Test for presence explicitly: the truth value of an lxml
                # element does not mean "element was found".
                if pageno is not None:
                    pageno = pageno.text
            if pageno:
                if contents is not None and pageno in contents:
                    ebook.flush_els()
                    if not pushed_chapters:
                        cdiv = E.div({ 'class':'newnav', 'id':'chapters' })
                        href = ebook.add_el(cdiv) + '#' + 'chapters'
                        ebook.push_navpoint('Chapters', href)
                        pushed_chapters = True
                    toc_id = 'toc-' + str(toc_item_number)
                    toc_item_number += 1
                    cdiv = E.div({ 'class':'newnav', 'id':toc_id })
                    href = ebook.add_el(cdiv) + '#' + toc_id
                    ebook.add_navpoint(contents[pageno], href)

                page_id = 'page-' + pageno
                pdiv = E.div({ 'class':'newpage', 'id':page_id })
                href = ebook.add_el(pdiv) + '#' + page_id
                ebook.add_pagetarget(pageno, pageno, href)

            if not include_page(page_scandata):
                continue

            page_type = page_type_of(page_scandata)

            if page_type == 'cover':
                if cover_number == 0:
                    cover_title = 'Front Cover'
                    front_cover = True
                else:
                    cover_title = 'Back Cover' ## xxx detect back page?
                    front_cover = False
                ebook.flush_els()
                if pushed_chapters:
                    ebook.pop_navpoint()
                    pushed_chapters = False
                (leaf_id, filename) = make_html_page_image(i, iabook, ebook,
                                                           cover=front_cover)
                if leaf_id is not None:
                    ebook.add_navpoint(cover_title, filename)
                    if cover_number == 0:
                        ebook.add_guide_item({ 'href':filename,
                                               'type':'cover',
                                               'title':cover_title })

                        # Add intro page after 1rst cover page
                        tree = make_html('Archive',
                            [E.p('This book made available by the Internet Archive.')])
                        ebook.add_content('intro',
                                          'intro.html',
                                          'application/xhtml+xml',
                                          common.tree_to_str(tree,
                                                             xml_declaration=False))
                        ebook.add_spine_item({ 'idref':'intro',
                                               'linear':'no' })
                cover_number += 1

            elif page_type == 'title' or page_type == 'title page':
                before_title_page = False
                (leaf_id, filename) = make_html_page_image(i, iabook, ebook)
                if leaf_id is not None:
                    ebook.add_navpoint('Title Page', filename)
                    ebook.add_guide_item({ 'href':filename,
                                           'type':'title-page',
                                           'title':'Title Page' })

            elif page_type == 'copyright':
                (leaf_id, filename) = make_html_page_image(i, iabook, ebook)
                if leaf_id is not None:
                    ebook.add_navpoint('Copyright', filename)
                    # Guide title now matches the page (was a copy-pasted
                    # 'Title Page').
                    ebook.add_guide_item({ 'href':filename,
                                           'type':'copyright-page',
                                           'title':'Copyright' })

            elif page_type == 'contents':
                (leaf_id, filename) = make_html_page_image(i, iabook, ebook)
                if leaf_id is not None:
                    if not made_contents_navpoint:
                        ebook.add_navpoint('Table of Contents', filename)
                        made_contents_navpoint = True
                    # Guide title now matches the page (was a copy-pasted
                    # 'Title Page').
                    ebook.add_guide_item({ 'href':filename,
                                           'type':'toc',
                                           'title':'Table of Contents' })

            elif page_type == 'normal':
                if before_title_page:
                    page_text = etree.tostring(page,
                                               method='text',
                                               encoding=unicode)
                    # Skip if not much text
                    if len(page_text) >= 10:
                        (leaf_id, filename) = make_html_page_image(i, iabook,
                                                                   ebook)
                    # XXX note that above might return None, None and do nothing...
                else:
                    first_par = True
                    saw_pageno_header_footer = False

                    for block in page:
                        if block.get('blockType') == 'Picture':
                            region = ((int(block.get('l')),
                                       int(block.get('t'))),
                                      (int(block.get('r')),
                                       int(block.get('b'))))
                            (l, t), (r, b) = region
                            region_width = r - l
                            region_height = b - t
                            orig_page_size = (int(page.get('width')),
                                              int(page.get('height')))
                            page_width, page_height = orig_page_size

                            # XXX bad aspect ratio!
                            # XXX need fixed code to get requested size
                            req_width = int(max_width *
                                            (region_width / float(page_width)))
                            req_height = int(max_height *
                                             (region_height / float(page_height)))
                            image = iabook.get_page_image(i,
                                                          (req_width, req_height),
                                                          orig_page_size,
                                                          kdu_reduce=2,
                                                          region=region)
                            if image is not None:
                                pic_id = 'picture' + str(picture_number)
                                pic_href = 'images/' + pic_id + '.jpg'
                                picture_number += 1
                                ebook.add_content(pic_id, pic_href,
                                                  'image/jpeg', image,
                                                  deflate=False)
                                el = E.p({ 'class':'illus' },
                                         E.img(src=pic_href,
                                               alt=pic_id))
                                ebook.add_el(el)
                            continue

                        for el in block:
                            if el.tag == aby_ns+'region':
                                for rect in el:
                                    pass
                            elif el.tag == aby_ns+'text':
                                for par in el:
                                    # skip if its the first line and it could be a header
                                    if first_par and common.par_is_pageno_header_footer(par):
                                        saw_pageno_header_footer = True
                                        first_par = False
                                        continue
                                    first_par = False

                                    # skip if it's the last par and it could be a header
                                    if (not saw_pageno_header_footer
                                        and block == page[-1]
                                        and el == block[-1]
                                        and par == el[-1]
                                        and common.par_is_pageno_header_footer(par)):
                                        saw_pageno_header_footer = True
                                        continue

                                    # Join the formatting runs of the
                                    # paragraph: runs are separated by a
                                    # space, except that a trailing '-' is
                                    # dropped (and no space added) when the
                                    # next run continues the same word.
                                    lines = []
                                    prev_line = ''
                                    for line in par:
                                        for fmt in line:
                                            fmt_text = etree.tostring(fmt,
                                                                      method='text',
                                                                      encoding=unicode)
                                            if len(fmt_text) > 0:
                                                if prev_line[-1:] == '-':
                                                    if fmt[0].get('wordStart') == 'false':
                                                        # ? and wordFromDictionary = true ?
                                                        lines.append(prev_line[:-1])
                                                    else:
                                                        lines.append(prev_line)
                                                else:
                                                    lines.append(prev_line)
                                                    lines.append(' ')
                                                prev_line = fmt_text
                                    lines.append(prev_line)

                                    if not made_pages:
                                        made_pages = True
                                        if not contents:
                                            # Without a TOC, give the text a
                                            # single 'Pages' navpoint.
                                            href = ebook.add_el(E.div({ 'class':'pages',
                                                                        'id':'pages' }))
                                            ebook.add_navpoint('Pages', href)

                                    to_add = ''.join(lines)
                                    ebook.add_el(E.p(to_add), len(to_add))
                            elif (el.tag == aby_ns+'row'):
                                pass
                            else:
                                print('unexpected tag type' + el.tag)
                                sys.exit(-1)
        finally:
            page.clear()

    ebook.flush_els()
    if pushed_chapters:
        ebook.pop_navpoint()
def process_book(iabook, ebook):
    """Stream the ABBYY pages of ``iabook`` and emit them into ``ebook``.

    iabook -- provides scandata, the metadata path, the ABBYY file and page
              images.
    ebook  -- receives content files, navpoints, guide items, spine items
              and page-map items (all passed as dicts).

    Pages not flagged ``addToAccessFormats == 'true'`` are skipped. Cover,
    title, copyright and contents pages become page images with navpoints
    and guide entries. Normal pages before the title page become page
    images; later ones are turned into paragraphs, which are written out as
    ``partNNNN.html`` chunks whenever more than 100 have accumulated, plus
    one final chunk for whatever remains at the end.

    Exits the process via ``sys.exit(-1)`` on an unexpected ABBYY tag.
    """
    aby_ns = "{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}"
    scandata = iabook.get_scandata()
    # Kept although unused below: parsing may raise on a bad metadata file.
    metadata = objectify.parse(iabook.get_metadata_path()).getroot()
    aby_file = iabook.get_abbyy()

    bookData = scandata.find('bookData')
    # XXX should fix below and similar by ensuring that scandata is always the same fmt...
    # scandata.zip/scandata.xml parses different?
    if bookData is None:
        bookData = scandata.bookData

    # some books no scanlog
#     scanLog = scandata.find('scanLog')
#     if scanLog is None:
#         scanLog = scandata.scanLog

    def page_type_of(page_scandata):
        # Lower-cased pageType text; 'normal' when it cannot be read.
        try:
            return page_scandata.pageType.text.lower()
        except AttributeError:
            return 'normal'

    def include_page(page_scandata):
        # True only for pages explicitly flagged addToAccessFormats == 'true'.
        if page_scandata is None:
            return False
        add = page_scandata.find('addToAccessFormats')
        if add is None:
            add = page_scandata.addToAccessFormats
        return add is not None and add.text == 'true'

    def par_is_header(par):
        # if:
        #   it's the first on the page
        #   there's only one line
        #   on that line, there's a formatting tag, s.t.
        #   - it has < 6 charParam kids
        #   - each is wordNumeric
        # then:
        #   Skip it!
        if len(par) != 1:
            return False
        line = par[0]
        for fmt in line:
            if len(fmt) > 6:
                continue
            saw_non_num = False
            for cp in fmt:
                if cp.get('wordNumeric') != 'true':
                    saw_non_num = True
                    break
            if not saw_non_num:
                return True
            hdr_text = etree.tostring(fmt,
                                      method='text',
                                      encoding=unicode)
            rnums = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii',
                     'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi',
                     'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii',
                     ]
            if hdr_text in rnums:
                return True
        return False

    def add_part(part_number, paragraphs, page_index=None):
        # Write one chunk of paragraphs as partNNNN.html and register it.
        # page_index, when given, is recorded in the page map for the chunk.
        part_str = 'part' + str(part_number).zfill(4)
        part_str_href = part_str + '.html'
        tree = make_html('sample title', paragraphs)
        ebook.add_content({ 'id':part_str,
                            'href':part_str_href,
                            'media-type':'application/xhtml+xml' },
                          common.tree_to_str(tree, xml_declaration=False))
        ebook.add_spine_item({ 'idref':part_str })
        if page_index is not None:
            ebook.add_page_map_item(page_index, part_str_href)
        if part_number == 0:
            ebook.add_guide_item({ 'href':part_str_href,
                                   'type':'text',
                                   'title':'Book' })
            ebook.add_navpoint({ 'text':'Pages', 'content':part_str_href })

    paragraphs = []
    i = 0
    part_number = 0
    cover_number = 0

    context = etree.iterparse(aby_file,
                              tag=aby_ns+'page',
                              resolve_entities=False)

    # Confirm that a title page exists at all. The comparison is done on the
    # lower-cased type, the same way the main loop classifies pages.
    found_title = False
    for page_scandata in iabook.get_scandata_pages():
        if page_type_of(page_scandata) in ('title', 'title page'):
            found_title = True
            break

    # True while a title page exists and has not been reached yet (normal
    # pages are then emitted as images); False from the title page onwards,
    # and False throughout when the book has no title page.
    before_title_page = found_title

    for event, page in context:
        page_scandata = iabook.get_page_scandata(i)

        if not include_page(page_scandata):
            # Release the parsed page even when it is skipped.
            page.clear()
            i += 1
            continue

        page_type = page_type_of(page_scandata)

        if page_type == 'cover':
            (leaf_id, filename) = make_html_page_image(i, iabook, ebook)
            if cover_number == 0:
                cover_title = 'Front Cover'
            else:
                cover_title = 'Back Cover' ## xxx detect back page?
            ebook.add_navpoint({ 'text':cover_title, 'content':filename })
            if cover_number == 0:
                ebook.add_guide_item({ 'href':filename,
                                       'type':'cover',
                                       'title':cover_title })
                ebook.add_cover_id(leaf_id)

                # Add intro page after 1rst cover page
                tree = make_html('Archive',
                    [E.p('This book made available by the Internet Archive.')])
                ebook.add_content({ 'id':'intro',
                                    'href':'intro.html',
                                    'media-type':'application/xhtml+xml' },
                                  common.tree_to_str(tree,
                                                     xml_declaration=False))
                ebook.add_spine_item({ 'idref':'intro' })
            cover_number += 1

        elif page_type == 'title' or page_type == 'title page':
            before_title_page = False
            (leaf_id, filename) = make_html_page_image(i, iabook, ebook)
            ebook.add_navpoint({ 'text':'Title Page', 'content':filename })
            ebook.add_guide_item({ 'href':filename,
                                   'type':'title-page',
                                   'title':'Title Page' })

        elif page_type == 'copyright':
            (leaf_id, filename) = make_html_page_image(i, iabook, ebook)
            ebook.add_navpoint({ 'text':'Copyright', 'content':filename })
            # Guide title now matches the page (was a copy-pasted
            # 'Title Page').
            ebook.add_guide_item({ 'href':filename,
                                   'type':'copyright-page',
                                   'title':'Copyright' })

        elif page_type == 'contents':
            (leaf_id, filename) = make_html_page_image(i, iabook, ebook)
            ebook.add_navpoint({ 'text':'Contents', 'content':filename })
            # Guide title now matches the page (was a copy-pasted
            # 'Title Page').
            ebook.add_guide_item({ 'href':filename,
                                   'type':'toc',
                                   'title':'Contents' })

        elif page_type == 'normal':
            if before_title_page:
                # XXX consider skipping if blank + no words?
                # make page image
                (leaf_id, filename) = make_html_page_image(i, iabook, ebook)
            else:
                first_par = True
                for block in page:
                    for el in block:
                        if el.tag == aby_ns+'region':
                            for rect in el:
                                pass
                        elif el.tag == aby_ns+'text':
                            for par in el:
                                if first_par and par_is_header(par):
                                    first_par = False
                                    continue
                                first_par = False

                                # Join the formatting runs of the paragraph:
                                # runs are separated by a space, except that
                                # a trailing '-' is dropped (and no space
                                # added) when the next run continues the
                                # same word.
                                lines = []
                                prev_line = ''
                                for line in par:
                                    for fmt in line:
                                        fmt_text = etree.tostring(fmt,
                                                                  method='text',
                                                                  encoding=unicode)
                                        if len(fmt_text) > 0:
                                            if prev_line[-1:] == '-':
                                                if fmt[0].get('wordStart') == 'false':
                                                    # ? and wordFromDictionary = true ?
                                                    lines.append(prev_line[:-1])
                                                else:
                                                    lines.append(prev_line)
                                            else:
                                                lines.append(prev_line)
                                                lines.append(' ')
                                            prev_line = fmt_text
                                lines.append(prev_line)
                                paragraphs.append(E.p(''.join(lines)))
                        elif (el.tag == aby_ns+'row'):
                            pass
                        else:
                            print('unexpected tag type' + el.tag)
                            sys.exit(-1)

        page.clear()
        i += 1

        if len(paragraphs) > 100:
            # make a chunk!
            add_part(part_number, paragraphs, page_index=i)
            part_number += 1
            paragraphs = []

    # make chunk from last paragraphs
    # After the loop at most 100 paragraphs remain, so anything left over
    # must be written here or it would be lost.
    if len(paragraphs) > 0:
        add_part(part_number, paragraphs)