def commit_ncx_toc(container, toc, lang=None, uid=None):
    """Serialize *toc* as an NCX document and write it into *container*.

    Reuses the container's existing NCX file when one is found; otherwise a
    new ``toc.ncx`` item is generated and registered as the ``toc`` of every
    ``<spine>`` element.  Language, unique identifier and title default to
    values pulled from the OPF metadata when not supplied by the caller.
    """
    tocname = find_existing_ncx_toc(container)
    if tocname is None:
        item = container.generate_item('toc.ncx', id_prefix='toc')
        tocname = container.href_to_name(item.get('href'),
                                         base=container.opf_name)
        ncx_id = item.get('id')
        # Point every spine at the new NCX item.
        # (Was a side-effect-only list comprehension.)
        for spine in container.opf_xpath('//opf:spine'):
            spine.set('toc', ncx_id)
    if not lang:
        lang = get_lang()
        # Prefer the first valid dc:language declared in the OPF.
        for lang_elem in container.opf_xpath('//dc:language'):
            candidate = canonicalize_lang(base.xml2text(lang_elem).strip())
            if candidate:
                lang = candidate
                break
    # Normalize to a two-letter ISO 639-1 code when one exists.
    lang = lang_as_iso639_1(lang) or lang
    if not uid:
        uid = base.uuid_id()
        eid = container.opf.get('unique-identifier', None)
        if eid:
            m = container.opf_xpath('//*[@id="%s"]' % eid)
            if m:
                uid = base.xml2text(m[0])
    title = 'Table of Contents'
    m = container.opf_xpath('//dc:title')
    if m:
        x = base.xml2text(m[0]).strip()
        title = x or title
    to_href = functools.partial(container.name_to_href, base=tocname)
    root = create_ncx(toc, to_href, title, lang, uid)
    container.replace(tocname, root)
    container.pretty_print.add(tocname)
def norm(x):
    """Normalize a locale object to a language tag like ``en`` or ``en-US``."""
    code = x.langcode
    # Fall back to the default country for this language when none is set.
    country = x.countrycode or cc_map.get(code, None)
    code = lang_as_iso639_1(code) or code
    return code if not country else code + '-' + country
def iana2mobi(icode):
    """Map an IANA language tag to the packed 4-byte MOBI language field."""
    langdict = IANA_MOBI[None]
    remaining = list(icode.split('-')) if icode else []
    # Consume leading subtags until one resolves to a known primary language.
    while remaining:
        candidate = lang_as_iso639_1(remaining.pop(0).lower())
        if candidate and candidate in IANA_MOBI:
            langdict = IANA_MOBI[candidate]
            break
    mcode = langdict[None]
    # Any leftover subtag may select a regional variant; try it verbatim,
    # then Title-cased, then UPPER-cased, stopping at the first hit.
    while remaining:
        sub = remaining.pop(0)
        for variant in (sub, sub.title(), sub.upper()):
            if variant in langdict:
                mcode = langdict[variant]
                break
        else:
            continue
        break
    return pack('>HBB', 0, mcode[1], mcode[0])
def html_lang(docx_lang):
    """Return the two-letter HTML lang code for a DOCX language, or None."""
    canonical = canonicalize_lang(docx_lang)
    # 'und' is the ISO code for an undetermined language; treat it as absent.
    if not canonical or canonical == 'und':
        return None
    return lang_as_iso639_1(canonical) or None
def migrate_lang_code(self, root):  # {{{
    """Rewrite the text of every <language> element to its ISO-639-1 code."""
    from ebook_converter.utils.localization import lang_as_iso639_1
    elements = root.xpath('//*[local-name() = "language"]')
    for element in elements:
        two_letter = lang_as_iso639_1(element.text)
        # Leave the element untouched when no two-letter code exists.
        if two_letter:
            element.text = two_letter
def fb2_header(self):
    """Build the FictionBook2 header (the <description> element).

    Collects title, author(s), language, cover, genre, keywords, series,
    identifiers, dates and publisher from ``self.oeb_book.metadata`` and
    interpolates them into the FB2 header template.  Returns the header as
    a string with empty lines removed.
    """
    metadata = {}
    metadata['title'] = self.oeb_book.metadata.title[0].value
    metadata['appname'] = __appname__
    metadata['version'] = __version__
    # Take a single timestamp so day/month/year are mutually consistent
    # even when this runs across a midnight date boundary.
    now = datetime.now()
    metadata['date'] = '%i.%i.%i' % (now.day, now.month, now.year)
    if self.oeb_book.metadata.language:
        lc = lang_as_iso639_1(self.oeb_book.metadata.language[0].value)
        if not lc:
            lc = self.oeb_book.metadata.language[0].value
        metadata['lang'] = lc or 'en'
    else:
        metadata['lang'] = 'en'
    metadata['id'] = None
    metadata['cover'] = self.get_cover()
    metadata['genre'] = self.opts.fb2_genre

    # FB2 wants structured names; split a display name on spaces into
    # first / middle(s) / last.
    metadata['author'] = ''
    for auth in self.oeb_book.metadata.creator:
        author_first = ''
        author_middle = ''
        author_last = ''
        author_parts = auth.value.split(' ')
        if len(author_parts) == 1:
            author_last = author_parts[0]
        elif len(author_parts) == 2:
            author_first, author_last = author_parts
        else:
            author_first = author_parts[0]
            author_middle = ' '.join(author_parts[1:-1])
            author_last = author_parts[-1]
        metadata['author'] += '<author>'
        metadata['author'] += ('<first-name>%s</first-name>' %
                               prepare_string_for_xml(author_first))
        if author_middle:
            metadata['author'] += ('<middle-name>%s</middle-name>' %
                                   prepare_string_for_xml(author_middle))
        metadata['author'] += ('<last-name>%s</last-name>' %
                               prepare_string_for_xml(author_last))
        metadata['author'] += '</author>'
    if not metadata['author']:
        metadata['author'] = ('<author><first-name></first-name>'
                              '<last-name></last-name></author>')

    metadata['keywords'] = ''
    tags = list(map(str, self.oeb_book.metadata.subject))
    if tags:
        tags = ', '.join(prepare_string_for_xml(x) for x in tags)
        metadata['keywords'] = '<keywords>%s</keywords>' % tags

    metadata['sequence'] = ''
    if self.oeb_book.metadata.series:
        index = '1'
        if self.oeb_book.metadata.series_index:
            index = self.oeb_book.metadata.series_index[0]
        seq = prepare_string_for_xml(str(self.oeb_book.metadata.series[0]))
        metadata['sequence'] = ('<sequence name="%s" number="%s"/>' %
                                (seq, index))

    year = publisher = isbn = ''
    identifiers = self.oeb_book.metadata['identifier']
    for x in identifiers:
        # The opf:scheme attribute may be absent; guard against None.lower().
        scheme = (x.get(base.tag('opf', 'scheme'), None) or '').lower()
        if scheme == 'uuid' or str(x).startswith('urn:uuid:'):
            metadata['id'] = str(x).split(':')[-1]
            break
    if metadata['id'] is None:
        self.log.warn('No UUID identifier found')
        metadata['id'] = str(uuid.uuid4())

    try:
        date = self.oeb_book.metadata['date'][0]
    except IndexError:
        pass
    else:
        year = ('<year>%s</year>' %
                prepare_string_for_xml(date.value.partition('-')[0]))
    try:
        publisher = self.oeb_book.metadata['publisher'][0]
    except IndexError:
        pass
    else:
        publisher = ('<publisher>%s</publisher>' %
                     prepare_string_for_xml(publisher.value))
    for x in identifiers:
        scheme = (x.get(base.tag('opf', 'scheme'), None) or '').lower()
        if scheme == 'isbn':
            isbn = '<isbn>%s</isbn>' % prepare_string_for_xml(x.value)
    metadata['year'] = year
    metadata['isbn'] = isbn
    metadata['publisher'] = publisher

    # Escape everything except the fields that already contain XML markup.
    for key, value in metadata.items():
        if key not in ('author', 'cover', 'sequence', 'keywords', 'year',
                       'publisher', 'isbn'):
            metadata[key] = prepare_string_for_xml(value)

    try:
        comments = self.oeb_book.metadata['description'][0]
    except Exception:
        metadata['comments'] = ''
    else:
        from ebook_converter.utils.html2text import html2text
        annot = prepare_string_for_xml(html2text(comments.value).strip())
        metadata['comments'] = f'<annotation><p>{annot}</p></annotation>'

    # Keep the indentation level of the description the same as the body.
    header = textwrap.dedent('''\
        <FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">
        <description>
            <title-info>
                <genre>%(genre)s</genre>
                %(author)s
                <book-title>%(title)s</book-title>
                %(cover)s
                <lang>%(lang)s</lang>
                %(keywords)s
                %(sequence)s
                %(comments)s
            </title-info>
            <document-info>
                %(author)s
                <program-used>%(appname)s %(version)s</program-used>
                <date>%(date)s</date>
                <id>%(id)s</id>
                <version>1.0</version>
            </document-info>
            <publish-info>
                %(publisher)s
                %(year)s
                %(isbn)s
            </publish-info>
        </description>''') % metadata
    # Remove empty lines.
    return '\n'.join(filter(str.strip, header.splitlines()))
def __init__(self, namespace, log, document_lang):
    """Set up per-document style bookkeeping.

    The document language is normalized to a two-letter code, falling
    back to ``'en'`` when no ISO-639-1 code exists for it.
    """
    self.namespace = namespace
    self.document_lang = lang_as_iso639_1(document_lang) or 'en'
    self.log = log
    self.block_styles = {}
    self.text_styles = {}
    self.styles_for_html_blocks = {}
def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None):
    """Write *toc* (and optional landmarks / page list) into the container's
    EPUB 3 nav document.

    :param container: the book container; the nav file is created in it when
        no existing one is found.
    :param toc: tree of TOC entries (each with title/dest/frag/children).
    :param lang: optional language set on the nav document's root element.
    :param landmarks: optional list of landmark dicts (type/dest/frag/title).
    :param previous_nav: optional (href, parsed-root) pair of a nav document
        to reuse instead of the template.
    """
    from ebook_converter.ebooks.oeb.polish.pretty import pretty_xml_tree
    tocname = find_existing_nav_toc(container)
    # Prefer a previously generated nav document when it still exists.
    if previous_nav is not None:
        nav_name = container.href_to_name(previous_nav[0])
        if nav_name and container.exists(nav_name):
            tocname = nav_name
            container.apply_unique_properties(tocname, 'nav')
    if tocname is None:
        # No nav found: generate a fresh nav.xhtml item with the 'nav'
        # property, seeded either from previous_nav or the bundled template.
        item = container.generate_item('nav.xhtml', id_prefix='nav')
        item.set('properties', 'nav')
        tocname = container.href_to_name(item.get('href'),
                                         base=container.opf_name)
        if previous_nav is not None:
            root = previous_nav[1]
        else:
            with open(
                    pkg_resources.resource_filename(
                        'ebook_converter', 'data/new_nav.html')) as fobj:
                root = container.parse_xhtml(fobj.read())
        container.replace(tocname, root)
    else:
        root = container.parsed(tocname)
    if lang:
        # Prefer the two-letter code; set both HTML and XML lang attributes.
        lang = lang_as_iso639_1(lang) or lang
        root.set('lang', lang)
        root.set(base.tag('xml', 'lang'), lang)
    nav = ensure_single_nav_of_type(root, 'toc')
    if toc.toc_title:
        nav.append(nav.makeelement(base.tag('xhtml', 'h1')))
        nav[-1].text = toc.toc_title
    rnode = nav.makeelement(base.tag('xhtml', 'ol'))
    nav.append(rnode)
    to_href = functools.partial(container.name_to_href, base=tocname)
    spat = re.compile(r'\s+')

    def process_node(xml_parent, toc_parent):
        # Recursively mirror the TOC tree as nested <ol>/<li> lists; entries
        # without a destination become <span> instead of <a>.
        for child in toc_parent:
            li = xml_parent.makeelement(base.tag('xhtml', 'li'))
            xml_parent.append(li)
            title = child.title or ''
            title = spat.sub(' ', title).strip()
            a = li.makeelement(
                base.tag('xhtml', 'a' if child.dest else 'span'))
            a.text = title
            li.append(a)
            if child.dest:
                href = to_href(child.dest)
                if child.frag:
                    href += '#' + child.frag
                a.set('href', href)
            if len(child):
                ol = li.makeelement(base.tag('xhtml', 'ol'))
                li.append(ol)
                process_node(ol, child)

    process_node(rnode, toc)
    pretty_xml_tree(nav)

    def collapse_li(parent):
        # Drop whitespace inside single-child <li> so they render on one line.
        for li in parent.iterdescendants(base.tag('xhtml', 'li')):
            if len(li) == 1:
                li.text = None
                li[0].tail = None

    collapse_li(nav)
    nav.tail = '\n'

    def create_li(ol, entry):
        # Append an <li><a href=...></a></li> for a landmark/page-list entry
        # and return the anchor so the caller can set its text/type.
        li = ol.makeelement(base.tag('xhtml', 'li'))
        ol.append(li)
        a = li.makeelement(base.tag('xhtml', 'a'))
        li.append(a)
        href = container.name_to_href(entry['dest'], tocname)
        if entry['frag']:
            href += '#' + entry['frag']
        a.set('href', href)
        return a

    if landmarks is not None:
        nav = ensure_single_nav_of_type(root, 'landmarks')
        nav.set('hidden', '')
        ol = nav.makeelement(base.tag('xhtml', 'ol'))
        nav.append(ol)
        for entry in landmarks:
            # Only landmarks with a type and an existing XHTML target count.
            if (entry['type'] and container.has_name(entry['dest']) and
                    container.mime_map[entry['dest']] in base.OEB_DOCS):
                a = create_li(ol, entry)
                a.set(base.tag('epub', 'type'), entry['type'])
                a.text = entry['title'] or None
        pretty_xml_tree(nav)
        collapse_li(nav)
    if toc.page_list:
        nav = ensure_single_nav_of_type(root, 'page-list')
        nav.set('hidden', '')
        ol = nav.makeelement(base.tag('xhtml', 'ol'))
        nav.append(ol)
        for entry in toc.page_list:
            if (container.has_name(entry['dest']) and
                    container.mime_map[entry['dest']] in base.OEB_DOCS):
                a = create_li(ol, entry)
                a.text = str(entry['pagenum'])
        pretty_xml_tree(nav)
        collapse_li(nav)
    container.replace(tocname, root)
def build_exth(metadata, prefer_author_sort=False, is_periodical=False,
               share_not_sync=True, cover_offset=None, thumbnail_offset=None,
               start_offset=None, mobi_doctype=2, num_of_resources=None,
               kf8_unknown_count=0, be_kindlegen2=False, kf8_header_index=None,
               page_progression_direction=None, primary_writing_mode=None):
    """Build the MOBI EXTH metadata record from *metadata*.

    Serializes every recognized metadata term plus the various offset /
    generator / writing-direction records into EXTH format and returns the
    complete record, padded to a 4-byte boundary, as bytes.

    :raises ValueError: when neither a date nor a timestamp is present in
        *metadata*.
    """
    exth = BytesIO()
    nrecs = 0
    for term in metadata:
        if term not in EXTH_CODES:
            continue
        code = EXTH_CODES[term]
        items = metadata[term]
        if term == 'creator':
            if prefer_author_sort:
                creators = [authors_to_sort_string([str(c)]) for c in items]
            else:
                creators = [str(c) for c in items]
            items = creators
        elif term == 'rights':
            try:
                rights = utf8_text(str(metadata.rights[0]))
            except Exception:
                rights = b'Unknown'
            exth.write(pack(b'>II', EXTH_CODES['rights'], len(rights) + 8))
            exth.write(rights)
            nrecs += 1
            continue
        for item in items:
            data = str(item)
            if term != 'description':
                data = COLLAPSE_RE.sub(' ', data)
            if term == 'identifier':
                # Only ISBN identifiers are written; strip the URN prefix.
                if data.lower().startswith('urn:isbn:'):
                    data = data[9:]
                elif item.scheme.lower() == 'isbn':
                    pass
                else:
                    continue
            if term == 'language':
                d2 = lang_as_iso639_1(data)
                if d2:
                    data = d2
            data = utf8_text(data)
            exth.write(pack(b'>II', code, len(data) + 8))
            exth.write(data)
            nrecs += 1

    # Write UUID as ASIN
    uuid = None
    for x in metadata['identifier']:
        # The opf:scheme attribute may be absent; guard against None.lower().
        scheme = (x.get(base.tag('opf', 'scheme'), None) or '').lower()
        if scheme == 'uuid' or str(x).startswith('urn:uuid:'):
            uuid = str(x).split(':')[-1]
            break
    if uuid is None:
        from uuid import uuid4
        uuid = str(uuid4())
    if isinstance(uuid, str):
        uuid = uuid.encode('utf-8')
    if not share_not_sync:
        exth.write(pack(b'>II', 113, len(uuid) + 8))
        exth.write(uuid)
        nrecs += 1

    # Write UUID as SOURCE
    c_uuid = b'calibre:%s' % uuid
    exth.write(pack(b'>II', 112, len(c_uuid) + 8))
    exth.write(c_uuid)
    nrecs += 1

    # Write cdetype
    if not is_periodical:
        if not share_not_sync:
            exth.write(pack(b'>II', 501, 12))
            exth.write(b'EBOK')
            nrecs += 1
    else:
        ids = {0x101: b'NWPR', 0x103: b'MAGZ'}.get(mobi_doctype, None)
        if ids:
            exth.write(pack(b'>II', 501, 12))
            exth.write(ids)
            nrecs += 1

    # Add a publication date entry
    # BUG FIX: datestr was unbound when both fields were empty, producing a
    # NameError instead of the intended ValueError below.
    datestr = None
    if metadata['date']:
        datestr = str(metadata['date'][0])
    elif metadata['timestamp']:
        datestr = str(metadata['timestamp'][0])
    if datestr is None:
        raise ValueError("missing date or timestamp")
    datestr = datestr.encode('utf-8')
    exth.write(pack(b'>II', EXTH_CODES['pubdate'], len(datestr) + 8))
    exth.write(datestr)
    nrecs += 1
    if is_periodical:
        exth.write(pack(b'>II', EXTH_CODES['lastupdatetime'],
                        len(datestr) + 8))
        exth.write(datestr)
        nrecs += 1

    if be_kindlegen2:
        mv = 201
        vals = {204: mv, 205: 2, 206: 9, 207: 0}
    elif is_periodical:
        # Pretend to be amazon's super secret periodical generator
        vals = {204: 201, 205: 2, 206: 0, 207: 101}
    else:
        # Pretend to be kindlegen 1.2
        vals = {204: 201, 205: 1, 206: 2, 207: 33307}
    for code, val in vals.items():
        exth.write(pack(b'>III', code, 12, val))
        nrecs += 1
    if be_kindlegen2:
        revnum = b'0730-890adc2'
        exth.write(pack(b'>II', 535, 8 + len(revnum)) + revnum)
        nrecs += 1

    if cover_offset is not None:
        exth.write(pack(b'>III', EXTH_CODES['coveroffset'], 12,
                        cover_offset))
        exth.write(pack(b'>III', EXTH_CODES['hasfakecover'], 12, 0))
        nrecs += 2
    if thumbnail_offset is not None:
        exth.write(pack(b'>III', EXTH_CODES['thumboffset'], 12,
                        thumbnail_offset))
        thumbnail_uri_str = ('kindle:embed:%s' % to_base(
            thumbnail_offset, base=32, min_num_digits=4)).encode('utf-8')
        exth.write(pack(b'>II', EXTH_CODES['kf8_thumbnail_uri'],
                        len(thumbnail_uri_str) + 8))
        exth.write(thumbnail_uri_str)
        nrecs += 2

    if start_offset is not None:
        # Accept either a single offset or a sequence of them.
        try:
            len(start_offset)
        except TypeError:
            start_offset = [start_offset]
        for so in start_offset:
            if so is not None:
                exth.write(pack(b'>III', EXTH_CODES['startreading'], 12, so))
                nrecs += 1

    if kf8_header_index is not None:
        exth.write(pack(b'>III', EXTH_CODES['kf8_header_index'], 12,
                        kf8_header_index))
        nrecs += 1
    if num_of_resources is not None:
        exth.write(pack(b'>III', EXTH_CODES['num_of_resources'], 12,
                        num_of_resources))
        nrecs += 1
    if kf8_unknown_count is not None:
        exth.write(pack(b'>III', EXTH_CODES['kf8_unknown_count'], 12,
                        kf8_unknown_count))
        nrecs += 1
    if primary_writing_mode:
        pwm = primary_writing_mode.encode('utf-8')
        exth.write(pack(b'>II', EXTH_CODES['primary_writing_mode'],
                        len(pwm) + 8))
        exth.write(pwm)
        nrecs += 1
    if page_progression_direction in {'rtl', 'ltr', 'default'}:
        ppd = page_progression_direction.encode('ascii')
        exth.write(pack(b'>II', EXTH_CODES['page_progression_direction'],
                        len(ppd) + 8))
        exth.write(ppd)
        nrecs += 1

    exth = exth.getvalue()
    trail = len(exth) % 4
    pad = b'\0' * (4 - trail)  # Always pad w/ at least 1 byte
    exth = [b'EXTH', pack(b'>II', len(exth) + 12, nrecs), exth, pad]
    return b''.join(exth)
def metadata_to_xmp_packet(mi):
    """Serialize the metadata object *mi* into an XMP packet.

    Builds an x:xmpmeta/rdf:RDF tree containing Dublin Core, xmp,
    prism/pdfx identifier, and calibre-specific rdf:Description blocks,
    then returns the serialized packet via ``serialize_xmp_packet``.
    """
    A = ElementMaker(namespace=NS_MAP['x'], nsmap=nsmap('x'))
    R = ElementMaker(namespace=NS_MAP['rdf'], nsmap=nsmap('rdf'))
    root = A.xmpmeta(R.RDF)
    rdf = root[0]

    # Dublin Core description block.
    dc = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap('dc'))
    dc.set(expand('rdf:about'), '')
    rdf.append(dc)
    for prop, tag in {
            'title': 'dc:title',
            'comments': 'dc:description'
    }.items():
        val = mi.get(prop) or ''
        create_alt_property(dc, tag, val)
    for prop, (tag, ordered) in {
            'authors': ('dc:creator', True),
            'tags': ('dc:subject', False),
            'publisher': ('dc:publisher', False)
    }.items():
        val = mi.get(prop) or ()
        # A bare string is treated as a one-element sequence.
        if isinstance(val, (str, bytes)):
            val = [val]
        create_sequence_property(dc, tag, val, ordered)
    if not mi.is_null('pubdate'):
        # Adobe spec recommends local time
        create_sequence_property(dc, 'dc:date',
                                 [isoformat(mi.pubdate, as_utc=False)])
    if not mi.is_null('languages'):
        # Prefer two-letter codes, falling back to canonicalized ones.
        langs = list(
            filter(
                None,
                map(lambda x: lang_as_iso639_1(x) or canonicalize_lang(x),
                    mi.languages)))
        if langs:
            create_sequence_property(dc, 'dc:language', langs, ordered=False)

    # xmp/xmpidq description block.
    xmp = rdf.makeelement(expand('rdf:Description'),
                          nsmap=nsmap('xmp', 'xmpidq'))
    xmp.set(expand('rdf:about'), '')
    rdf.append(xmp)

    # Extra per-scheme blocks for prism and pdfx identifiers.
    extra_ids = {}
    for x in ('prism', 'pdfx'):
        p = extra_ids[x] = rdf.makeelement(expand('rdf:Description'),
                                           nsmap=nsmap(x))
        p.set(expand('rdf:about'), '')
        rdf.append(p)
    identifiers = mi.get_identifiers()
    if identifiers:
        create_identifiers(xmp, identifiers)
        for scheme, val in identifiers.items():
            # ISBN and DOI are duplicated into the prism/pdfx blocks.
            if scheme in {'isbn', 'doi'}:
                for prefix, parent in extra_ids.items():
                    ie = parent.makeelement(
                        expand('%s:%s' % (prefix, scheme)))
                    ie.text = val
                    parent.append(ie)

    d = xmp.makeelement(expand('xmp:MetadataDate'))
    d.text = isoformat(now(), as_utc=False)
    xmp.append(d)

    # calibre-specific description block.
    calibre = rdf.makeelement(expand('rdf:Description'),
                              nsmap=nsmap('calibre', 'calibreSI',
                                          'calibreCC'))
    calibre.set(expand('rdf:about'), '')
    rdf.append(calibre)
    if not mi.is_null('rating'):
        try:
            r = float(mi.rating)
        except (TypeError, ValueError):
            pass
        else:
            create_simple_property(calibre, 'calibre:rating', '%g' % r)
    if not mi.is_null('series'):
        create_series(calibre, mi.series, mi.series_index)
    if not mi.is_null('timestamp'):
        create_simple_property(calibre, 'calibre:timestamp',
                               isoformat(mi.timestamp, as_utc=False))
    for x in ('author_link_map', 'user_categories'):
        val = getattr(mi, x, None)
        if val:
            create_simple_property(calibre, 'calibre:' + x, dump_dict(val))
    for x in ('title_sort', 'author_sort'):
        if not mi.is_null(x):
            create_simple_property(calibre, 'calibre:' + x, getattr(mi, x))
    all_user_metadata = mi.get_all_user_metadata(True)
    if all_user_metadata:
        create_user_metadata(calibre, all_user_metadata)
    return serialize_xmp_packet(root)
def lang_for_tag(tag):
    """Return the ISO-639-1 code from *tag*'s lang/xml:lang attribute, or None."""
    candidates = ('lang', '{http://www.w3.org/XML/1998/namespace}lang')
    for attr in candidates:
        code = lang_as_iso639_1(tag.get(attr))
        if code:
            return code
    return None
def _set_metadata(raw, mi):
    """Apply the metadata in *mi* to the ODF meta document *raw*.

    Known fields are written as standard office/dc/meta elements; calibre
    specific fields (title_sort, series, identifiers, rating, ...) are
    written as ``meta:user-defined`` entries prefixed with ``opf.``.
    Returns the updated XML serialized as pretty-printed UTF-8 bytes.

    NOTE(review): ``fields`` (tag-name -> (namespace, local-name) map) is
    defined elsewhere in this module — not visible here.
    """
    root = fromstring(raw)
    namespaces = {'office': OFFICENS, 'meta': METANS, 'dc': DCNS}
    nsrmap = {v: k for k, v in namespaces.items()}

    def xpath(expr, parent=root):
        # Evaluate *expr* with this document's namespace prefixes bound.
        return parent.xpath(expr, namespaces=namespaces)

    def remove(*tag_names):
        # Delete all existing occurrences of the named meta elements.
        for tag_name in tag_names:
            ns = fields[tag_name][0]
            tag_name = '{}:{}'.format(nsrmap[ns], tag_name)
            for x in xpath('descendant::' + tag_name, meta):
                x.getparent().remove(x)

    def add(tag, val=None):
        # Append a new meta element with optional text; return it.
        ans = meta.makeelement('{%s}%s' % fields[tag])
        ans.text = val
        meta.append(ans)
        return ans

    def remove_user_metadata(*names):
        # Delete meta:user-defined entries whose name matches (case-insensitive).
        for x in xpath('//meta:user-defined'):
            q = (x.get('{%s}name' % METANS) or '').lower()
            if q in names:
                x.getparent().remove(x)

    def add_um(name, val, vtype='string'):
        # Append a typed meta:user-defined entry.
        ans = add('user-defined', val)
        ans.set('{%s}value-type' % METANS, vtype)
        ans.set('{%s}name' % METANS, name)

    def add_user_metadata(name, val):
        # On first use, mark the document as carrying opf metadata; the
        # sentinel attribute on the function keeps this a one-time action.
        if not hasattr(add_user_metadata, 'sentinel_added'):
            add_user_metadata.sentinel_added = True
            remove_user_metadata('opf.metadata')
            add_um('opf.metadata', 'true', 'boolean')
        val_type = 'string'
        # Date-like values (anything with strftime) are stored as dates.
        if hasattr(val, 'strftime'):
            val = isoformat(val, as_utc=True).split('T')[0]
            val_type = 'date'
        add_um(name, val, val_type)

    meta = xpath('//office:meta')[0]
    if not mi.is_null('title'):
        remove('title')
        add('title', mi.title)
    if not mi.is_null('title_sort'):
        remove_user_metadata('opf.titlesort')
        add_user_metadata('opf.titlesort', mi.title_sort)
    if not mi.is_null('authors'):
        remove('initial-creator', 'creator')
        val = authors_to_string(mi.authors)
        add('initial-creator', val), add('creator', val)
        remove_user_metadata('opf.authors')
        add_user_metadata('opf.authors', val)
    if not mi.is_null('author_sort'):
        remove_user_metadata('opf.authorsort')
        add_user_metadata('opf.authorsort', mi.author_sort)
    if not mi.is_null('comments'):
        remove('description')
        add('description', mi.comments)
    if not mi.is_null('tags'):
        remove('keyword')
        add('keyword', ', '.join(mi.tags))
    if not mi.is_null('languages'):
        # Only write the language when a two-letter code exists for it.
        lang = lang_as_iso639_1(mi.languages[0])
        if lang:
            remove('language')
            add('language', lang)
    if not mi.is_null('pubdate'):
        remove_user_metadata('opf.pubdate')
        add_user_metadata('opf.pubdate', mi.pubdate)
    if not mi.is_null('publisher'):
        remove_user_metadata('opf.publisher')
        add_user_metadata('opf.publisher', mi.publisher)
    if not mi.is_null('series'):
        remove_user_metadata('opf.series', 'opf.seriesindex')
        add_user_metadata('opf.series', mi.series)
        add_user_metadata('opf.seriesindex', '{}'.format(mi.series_index))
    if not mi.is_null('identifiers'):
        remove_user_metadata('opf.identifiers')
        add_user_metadata('opf.identifiers', str(json.dumps(mi.identifiers)))
    if not mi.is_null('rating'):
        remove_user_metadata('opf.rating')
        add_user_metadata('opf.rating', '%.2g' % mi.rating)
    return tostring(root, encoding='utf-8', pretty_print=True)