def get_book_language(container): for lang in container.opf_xpath('//dc:language'): raw = lang.text if raw: code = canonicalize_lang(raw.split(',')[0].strip()) if code: return code
def field_from_string(field, raw, field_metadata): ''' Parse the string raw to return an object that is suitable for calling set() on a Metadata object. ''' dt = field_metadata['datatype'] val = object if dt in {'int', 'float'}: val = int(raw) if dt == 'int' else float(raw) elif dt == 'rating': val = float(raw) * 2 elif dt == 'datetime': from ebook_converter.utils.date import parse_only_date val = parse_only_date(raw) elif dt == 'bool': if raw.lower() in {'true', 'yes', 'y'}: val = True elif raw.lower() in {'false', 'no', 'n'}: val = False else: raise ValueError('Unknown value for %s: %s'%(field, raw)) elif dt == 'text': ism = field_metadata['is_multiple'] if ism: val = [x.strip() for x in raw.split(ism['ui_to_list'])] if field == 'identifiers': val = {x.partition(':')[0]:x.partition(':')[-1] for x in val} elif field == 'languages': from ebook_converter.utils.localization import canonicalize_lang val = [canonicalize_lang(x) for x in val] val = [x for x in val if x] if val is object: val = raw return val
def get_title_sort_pat(lang=None): ans = _title_pats.get(lang, None) if ans is not None: return ans q = lang from ebook_converter.utils.localization import canonicalize_lang, get_lang if lang is None: q = tweaks['default_language_for_title_sort'] if q is None: q = get_lang() q = canonicalize_lang(q) if q else q data = tweaks['per_language_title_sort_articles'] try: ans = data.get(q, None) except AttributeError: ans = None # invalid tweak value try: ans = frozenset(ans) if ans else frozenset(data['eng']) except: ans = frozenset((r'A\s+', r'The\s+', r'An\s+')) ans = '|'.join(ans) ans = '^(%s)' % ans try: ans = re.compile(ans, re.IGNORECASE) except: ans = re.compile(r'^(A|The|An)\s+', re.IGNORECASE) _title_pats[lang] = ans return ans
def commit_ncx_toc(container, toc, lang=None, uid=None): tocname = find_existing_ncx_toc(container) if tocname is None: item = container.generate_item('toc.ncx', id_prefix='toc') tocname = container.href_to_name(item.get('href'), base=container.opf_name) ncx_id = item.get('id') [s.set('toc', ncx_id) for s in container.opf_xpath('//opf:spine')] if not lang: lang = get_lang() for _l in container.opf_xpath('//dc:language'): _l = canonicalize_lang(base.xml2text(_l).strip()) if _l: lang = _l lang = lang_as_iso639_1(_l) or _l break lang = lang_as_iso639_1(lang) or lang if not uid: uid = base.uuid_id() eid = container.opf.get('unique-identifier', None) if eid: m = container.opf_xpath('//*[@id="%s"]' % eid) if m: uid = base.xml2text(m[0]) title = 'Table of Contents' m = container.opf_xpath('//dc:title') if m: x = base.xml2text(m[0]).strip() title = x or title to_href = functools.partial(container.name_to_href, base=tocname) root = create_ncx(toc, to_href, title, lang, uid) container.replace(tocname, root) container.pretty_print.add(tocname)
def read_languages(root, prefixes, refines): ans = [] for lang in XPath('./opf:metadata/dc:language')(root): val = canonicalize_lang((lang.text or '').strip()) if val and val not in ans and val != 'und': ans.append(val) return uniq(ans)
def read_default_style_language(raw, mi, XPath): root = etree.fromstring(raw) for lang in XPath('/w:styles/w:docDefaults/w:rPrDefault/w:rPr/w:lang/' '@w:val')(root): lang = canonicalize_lang(lang) if lang: mi.languages = [lang] break
def get_comic_book_info(d, mi, series_index='volume'): # See http://code.google.com/p/comicbookinfo/wiki/Example series = d.get('series', '') if series.strip(): mi.series = series si = d.get(series_index, None) if si is None: si = d.get('issue' if series_index == 'volume' else 'volume', None) if si is not None: try: mi.series_index = float(si) except Exception: mi.series_index = 1 if d.get('language', None): lang = canonicalize_lang(d.get('lang')) if lang: mi.languages = [lang] if d.get('rating', -1) > -1: mi.rating = d['rating'] for x in ('title', 'publisher'): y = d.get(x, '').strip() if y: setattr(mi, x, y) tags = d.get('tags', []) if tags: mi.tags = tags authors = [] for credit in d.get('credits', []): if credit.get('role', '') in ('Writer', 'Artist', 'Cartoonist', 'Creator'): x = credit.get('person', '') if x: x = ' '.join((reversed(x.split(', ')))) authors.append(x) if authors: mi.authors = authors comments = d.get('comments', '') if comments and comments.strip(): mi.comments = comments.strip() pubm, puby = d.get('publicationMonth', None), d.get('publicationYear', None) if puby is not None: from ebook_converter.utils.date import parse_only_date from datetime import date try: dt = date(puby, 6 if pubm is None else pubm, 15) dt = parse_only_date(str(dt)) mi.pubdate = dt except Exception: pass
def parse_lang_code(raw): raw = raw or '' parts = raw.replace('_', '-').split('-') lc = canonicalize_lang(parts[0]) if lc is None: raise ValueError('Invalid language code: %r' % raw) cc = None if len(parts) > 1: ccodes, ccodemap = get_codes()[:2] q = parts[1].upper() if q in ccodes: cc = q else: cc = ccodemap.get(q, None) return DictionaryLocale(lc, cc)
def read_doc_props(raw, mi, XPath): root = etree.fromstring(raw) titles = XPath('//dc:title')(root) if titles: title = titles[0].text if title and title.strip(): mi.title = title.strip() tags = [] for subject in XPath('//dc:subject')(root): if subject.text and subject.text.strip(): tags.append(subject.text.strip().replace(',', '_')) for keywords in XPath('//cp:keywords')(root): if keywords.text and keywords.text.strip(): for x in keywords.text.split(): tags.extend(y.strip() for y in x.split(',') if y.strip()) if tags: mi.tags = tags authors = XPath('//dc:creator')(root) aut = [] for author in authors: if author.text and author.text.strip(): aut.extend(string_to_authors(author.text)) if aut: mi.authors = aut mi.author_sort = authors_to_sort_string(aut) desc = XPath('//dc:description')(root) if desc: raw = etree.tostring(desc[0], method='text', encoding='unicode') # Word 2007 mangles newlines in the summary raw = raw.replace('_x000d_', '') mi.comments = raw.strip() langs = [] for lang in XPath('//dc:language')(root): if lang.text and lang.text.strip(): canonic_lang = canonicalize_lang(lang.text) if canonic_lang: langs.append(canonic_lang) if langs: mi.languages = langs
def create_oebbook(self, htmlpath, basedir, opts, log, mi): import uuid from ebook_converter.ebooks.conversion.plumber import create_oebbook from ebook_converter.ebooks.oeb.base import (DirContainer, rewrite_links, urlnormalize, BINARY_MIME, OEB_STYLES, xpath, urlquote) from ebook_converter.ebooks.oeb.transforms.metadata import \ meta_info_to_oeb_metadata from ebook_converter.ebooks.html.input import get_filelist from ebook_converter.ebooks.metadata import string_to_authors from ebook_converter.utils.localization import canonicalize_lang import css_parser, logging css_parser.log.setLevel(logging.WARN) self.OEB_STYLES = OEB_STYLES oeb = create_oebbook(log, None, opts, self, encoding=opts.input_encoding, populate=False) self.oeb = oeb metadata = oeb.metadata meta_info_to_oeb_metadata(mi, metadata, log) if not metadata.language: l = canonicalize_lang(getattr(opts, 'language', None)) if not l: oeb.logger.warn('Language not specified') l = get_lang().replace('_', '-') metadata.add('language', l) if not metadata.creator: a = getattr(opts, 'authors', None) if a: a = string_to_authors(a) if not a: oeb.logger.warn('Creator not specified') a = [self.oeb.translate('Unknown')] for aut in a: metadata.add('creator', aut) if not metadata.title: oeb.logger.warn('Title not specified') metadata.add('title', self.oeb.translate('Unknown')) bookid = str(uuid.uuid4()) metadata.add('identifier', bookid, id='uuid_id', scheme='uuid') for ident in metadata.identifier: if 'id' in ident.attrib: self.oeb.uid = metadata.identifier[0] break filelist = get_filelist(htmlpath, basedir, opts, log) filelist = [f for f in filelist if not f.is_binary] htmlfile_map = {} for f in filelist: path = f.path oeb.container = DirContainer(os.path.dirname(path), log, ignore_opf=True) bname = os.path.basename(path) id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname)) htmlfile_map[path] = href item = oeb.manifest.add(id, href, 'text/html') if path == htmlpath and '%' in path: bname = urlquote(bname) item.html_input_href = bname oeb.spine.add(item, True) self.added_resources = {} self.log = log self.log('Normalizing filename cases') for path, href in htmlfile_map.items(): self.added_resources[path] = href self.urlnormalize, self.DirContainer = urlnormalize, DirContainer self.urldefrag = urllib.parse.urldefrag self.BINARY_MIME = BINARY_MIME self.log('Rewriting HTML links') for f in filelist: path = f.path dpath = os.path.dirname(path) oeb.container = DirContainer(dpath, log, ignore_opf=True) href = htmlfile_map[path] try: item = oeb.manifest.hrefs[href] except KeyError: item = oeb.manifest.hrefs[urlnormalize(href)] rewrite_links(item.data, functools.partial(self.resource_adder, base=dpath)) for item in oeb.manifest.values(): if item.media_type in self.OEB_STYLES: dpath = None for path, href in self.added_resources.items(): if href == item.href: dpath = os.path.dirname(path) break css_parser.replaceUrls(item.data, functools.partial(self.resource_adder, base=dpath)) toc = self.oeb.toc self.oeb.auto_generated_toc = True titles = [] headers = [] for item in self.oeb.spine: if not item.linear: continue html = item.data title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) title = re.sub(r'\s+', ' ', title.strip()) if title: titles.append(title) headers.append('(unlabled)') for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): expr = '/h:html/h:body//h:%s[position()=1]/text()' header = ''.join(xpath(html, expr % tag)) header = re.sub(r'\s+', ' ', header.strip()) if header: headers[-1] = header break use = titles if len(titles) > len(set(titles)): use = headers for title, item in zip(use, self.oeb.spine): if not item.linear: continue toc.add(title, item.href) oeb.container = DirContainer(os.getcwd(), oeb.log, ignore_opf=True) return oeb
def html_lang(docx_lang): lang = canonicalize_lang(docx_lang) if lang and lang != 'und': lang = lang_as_iso639_1(lang) if lang: return lang
def __init__(self, raw, codec, title): self.doctype = raw[:4] self.length, self.num_items = struct.unpack('>LL', raw[4:12]) raw = raw[12:] pos = 0 self.mi = MetaInformation('Unknown', ['Unknown']) self.has_fake_cover = True self.start_offset = None left = self.num_items self.kf8_header = None self.uuid = self.cdetype = None self.page_progression_direction = None self.primary_writing_mode = None self.decode = lambda x: clean_ascii_chars(x.decode(codec, 'replace')) while left > 0: left -= 1 idx, size = struct.unpack('>LL', raw[pos:pos + 8]) content = raw[pos + 8:pos + size] pos += size if idx >= 100 and idx < 200: self.process_metadata(idx, content, codec) elif idx == 203: self.has_fake_cover = bool(struct.unpack('>L', content)[0]) elif idx == 201: co, = struct.unpack('>L', content) if co < NULL_INDEX: self.cover_offset = co elif idx == 202: self.thumbnail_offset, = struct.unpack('>L', content) elif idx == 501: try: self.cdetype = content.decode('ascii') except UnicodeDecodeError: self.cdetype = None # cdetype if content == b'EBSP': if not self.mi.tags: self.mi.tags = [] self.mi.tags.append('Sample Book') elif idx == 502: # last update time pass elif idx == 503: # Long title # Amazon seems to regard this as the definitive book title # rather than the title from the PDB header. In fact when # sending MOBI files through Amazon's email service if the # title contains non ASCII chars or non filename safe chars # they are messed up in the PDB header try: title = self.decode(content) except Exception: pass elif idx == 524: # Lang code try: lang = content.decode(codec) lang = canonicalize_lang(lang) if lang: self.mi.language = lang except Exception: pass elif idx == 525: try: pwm = content.decode(codec) if pwm: self.primary_writing_mode = pwm except Exception: pass elif idx == 527: try: ppd = content.decode(codec) if ppd: self.page_progression_direction = ppd except Exception: pass # else: # print 'unknown record', idx, repr(content) if title: title = clean_xml_chars(clean_ascii_chars(title)) self.mi.title = entities.replace_entities(title)
def run(self, path_to_output, opts, db, notification=DummyReporter()): from ebook_converter.library.catalogs.epub_mobi_builder import CatalogBuilder from ebook_converter.utils.logging import default_log as log from ebook_converter.utils.config import JSONConfig # If preset specified from the cli, insert stored options from JSON file if hasattr(opts, 'preset') and opts.preset: available_presets = JSONConfig("catalog_presets") if opts.preset not in available_presets: if available_presets: print('Error: Preset "%s" not found.' % opts.preset) print('Stored presets: %s' % ', '.join([p for p in sorted(available_presets.keys())])) else: print('Error: No stored presets.') return 1 # Copy the relevant preset values to the opts object for item in available_presets[opts.preset]: if item not in ['exclusion_rules_tw', 'format', 'prefix_rules_tw']: setattr(opts, item, available_presets[opts.preset][item]) # Provide an unconnected device opts.connected_device = { 'is_device_connected': False, 'kind': None, 'name': None, 'save_template': None, 'serial': None, 'storage': None, } # Convert prefix_rules and exclusion_rules from JSON lists to tuples prs = [] for rule in opts.prefix_rules: prs.append(tuple(rule)) opts.prefix_rules = tuple(prs) ers = [] for rule in opts.exclusion_rules: ers.append(tuple(rule)) opts.exclusion_rules = tuple(ers) opts.log = log opts.fmt = self.fmt = path_to_output.rpartition('.')[2] # Add local options opts.creator = '%s, %s %s, %s' % (date.strftime('%A'), date.strftime('%B'), date.strftime('%d').lstrip('0'), date.strftime('%Y')) opts.creator_sort_as = '%s %s' % ('calibre', date.strftime('%Y-%m-%d')) opts.connected_kindle = False # Finalize output_profile op = opts.output_profile if op is None: op = 'default' if opts.connected_device['name'] and 'kindle' in opts.connected_device['name'].lower(): opts.connected_kindle = True if opts.connected_device['serial'] and \ opts.connected_device['serial'][:4] in ['B004', 'B005']: op = "kindle_dx" else: op = "kindle" opts.description_clip = 380 if op.endswith('dx') or 'kindle' not in op else 100 opts.author_clip = 100 if op.endswith('dx') or 'kindle' not in op else 60 opts.output_profile = op opts.basename = "Catalog" opts.cli_environment = not hasattr(opts, 'sync') # Hard-wired to always sort descriptions by author, with series after non-series opts.sort_descriptions_by_author = True build_log = [] build_log.append("%s('%s'): Generating %s %sin %s environment, locale: '%s'" % (self.name, current_library_name(), self.fmt, 'for %s ' % opts.output_profile if opts.output_profile else '', 'CLI' if opts.cli_environment else 'GUI', langcode_to_name(canonicalize_lang(get_lang()), localize=False)) ) # If exclude_genre is blank, assume user wants all tags as genres if opts.exclude_genre.strip() == '': # opts.exclude_genre = '\[^.\]' # build_log.append(" converting empty exclude_genre to '\[^.\]'") opts.exclude_genre = 'a^' build_log.append(" converting empty exclude_genre to 'a^'") if opts.connected_device['is_device_connected'] and \ opts.connected_device['kind'] == 'device': if opts.connected_device['serial']: build_log.append(" connected_device: '%s' #%s%s " % (opts.connected_device['name'], opts.connected_device['serial'][0:4], 'x' * (len(opts.connected_device['serial']) - 4))) for storage in opts.connected_device['storage']: if storage: build_log.append(" mount point: %s" % storage) else: build_log.append(" connected_device: '%s'" % opts.connected_device['name']) try: for storage in opts.connected_device['storage']: if storage: build_log.append(" mount point: %s" % storage) except: build_log.append(" (no mount points)") else: build_log.append(" connected_device: '%s'" % opts.connected_device['name']) opts_dict = vars(opts) if opts_dict['ids']: build_log.append(" book count: %d" % len(opts_dict['ids'])) sections_list = [] if opts.generate_authors: sections_list.append('Authors') if opts.generate_titles: sections_list.append('Titles') if opts.generate_series: sections_list.append('Series') if opts.generate_genres: sections_list.append('Genres') if opts.generate_recently_added: sections_list.append('Recently Added') if opts.generate_descriptions: sections_list.append('Descriptions') if not sections_list: if opts.cli_environment: opts.log.warn('*** No Section switches specified, enabling all Sections ***') opts.generate_authors = True opts.generate_titles = True opts.generate_series = True opts.generate_genres = True opts.generate_recently_added = True opts.generate_descriptions = True sections_list = ['Authors', 'Titles', 'Series', 'Genres', 'Recently Added', 'Descriptions'] else: opts.log.warn('\n*** No enabled Sections, terminating catalog generation ***') return ["No Included Sections", "No enabled Sections.\nCheck E-book options tab\n'Included sections'\n"] if opts.fmt == 'mobi' and sections_list == ['Descriptions']: warning = ("\n*** Adding 'By authors' section required for MOBI " "output ***") opts.log.warn(warning) sections_list.insert(0, 'Authors') opts.generate_authors = True opts.log(" Sections: %s" % ', '.join(sections_list)) opts.section_list = sections_list # Limit thumb_width to 1.0" - 2.0" try: if float(opts.thumb_width) < float(self.THUMB_SMALLEST): log.warning("coercing thumb_width from '%s' to '%s'" % (opts.thumb_width, self.THUMB_SMALLEST)) opts.thumb_width = self.THUMB_SMALLEST if float(opts.thumb_width) > float(self.THUMB_LARGEST): log.warning("coercing thumb_width from '%s' to '%s'" % (opts.thumb_width, self.THUMB_LARGEST)) opts.thumb_width = self.THUMB_LARGEST opts.thumb_width = "%.2f" % float(opts.thumb_width) except: log.error("coercing thumb_width from '%s' to '%s'" % (opts.thumb_width, self.THUMB_SMALLEST)) opts.thumb_width = "1.0" # eval prefix_rules if passed from command line if type(opts.prefix_rules) is not tuple: try: opts.prefix_rules = eval(opts.prefix_rules) except: log.error("malformed --prefix-rules: %s" % opts.prefix_rules) raise for rule in opts.prefix_rules: if len(rule) != 4: log.error("incorrect number of args for --prefix-rules: %s" % repr(rule)) # eval exclusion_rules if passed from command line if type(opts.exclusion_rules) is not tuple: try: opts.exclusion_rules = eval(opts.exclusion_rules) except: log.error("malformed --exclusion-rules: %s" % opts.exclusion_rules) raise for rule in opts.exclusion_rules: if len(rule) != 3: log.error("incorrect number of args for --exclusion-rules: %s" % repr(rule)) # Display opts keys = sorted(opts_dict.keys()) build_log.append(" opts:") for key in keys: if key in ['catalog_title', 'author_clip', 'connected_kindle', 'creator', 'cross_reference_authors', 'description_clip', 'exclude_book_marker', 'exclude_genre', 'exclude_tags', 'exclusion_rules', 'fmt', 'genre_source_field', 'header_note_source_field', 'merge_comments_rule', 'output_profile', 'prefix_rules', 'preset', 'read_book_marker', 'search_text', 'sort_by', 'sort_descriptions_by_author', 'sync', 'thumb_width', 'use_existing_cover', 'wishlist_tag']: build_log.append(" %s: %s" % (key, repr(opts_dict[key]))) if opts.verbose: log('\n'.join(line for line in build_log)) # Capture start_time opts.start_time = time.time() self.opts = opts if opts.verbose: log.info(" Begin catalog source generation (%s)" % str(datetime.timedelta(seconds=int(time.time() - opts.start_time)))) # Launch the Catalog builder catalog = CatalogBuilder(db, opts, self, report_progress=notification) try: catalog.build_sources() if opts.verbose: log.info(" Completed catalog source generation (%s)\n" % str(datetime.timedelta(seconds=int(time.time() - opts.start_time)))) except (AuthorSortMismatchException, EmptyCatalogException) as e: log.error(" *** Terminated catalog generation: %s ***" % e) except: log.error(" unhandled exception in catalog generator") raise else: recommendations = [] recommendations.append(('remove_fake_margins', False, OptionRecommendation.HIGH)) recommendations.append(('comments', '', OptionRecommendation.HIGH)) """ >>> Use to debug generated catalog code before pipeline conversion <<< """ GENERATE_DEBUG_EPUB = False if GENERATE_DEBUG_EPUB: catalog_debug_path = os.path.join(os.path.expanduser('~'), 'Desktop', 'Catalog debug') setattr(opts, 'debug_pipeline', os.path.expanduser(catalog_debug_path)) dp = getattr(opts, 'debug_pipeline', None) if dp is not None: recommendations.append(('debug_pipeline', dp, OptionRecommendation.HIGH)) if opts.output_profile and opts.output_profile.startswith("kindle"): recommendations.append(('output_profile', opts.output_profile, OptionRecommendation.HIGH)) recommendations.append(('book_producer', opts.output_profile, OptionRecommendation.HIGH)) if opts.fmt == 'mobi': recommendations.append(('no_inline_toc', True, OptionRecommendation.HIGH)) recommendations.append(('verbose', 2, OptionRecommendation.HIGH)) # Use existing cover or generate new cover cpath = None existing_cover = False try: search_text = 'title:"%s" author:%s' % ( opts.catalog_title.replace('"', '\\"'), 'calibre') matches = db.search(search_text, return_matches=True, sort_results=False) if matches: cpath = db.cover(matches[0], index_is_id=True, as_path=True) if cpath and os.path.exists(cpath): existing_cover = True except: pass if self.opts.use_existing_cover and not existing_cover: log.warning("no existing catalog cover found") if self.opts.use_existing_cover and existing_cover: recommendations.append(('cover', cpath, OptionRecommendation.HIGH)) log.info("using existing catalog cover") else: # TODO(gryf): feature: generating cover with pillow. pass # from ebook_converter.ebooks.covers import calibre_cover2 # log.info("replacing catalog cover") # new_cover_path = PersistentTemporaryFile(suffix='.jpg') # # new_cover = calibre_cover2(opts.catalog_title, 'calibre') # new_cover_path.write('') # new_cover_path.close() # recommendations.append(('cover', new_cover_path.name, OptionRecommendation.HIGH)) # Run ebook-convert from ebook_converter.ebooks.conversion.plumber import Plumber plumber = Plumber(os.path.join(catalog.catalog_path, opts.basename + '.opf'), path_to_output, log, report_progress=notification, abort_after_input_dump=False) plumber.merge_ui_recommendations(recommendations) plumber.run() try: os.remove(cpath) except: pass if GENERATE_DEBUG_EPUB: from ebook_converter.ebooks.epub import initialize_container from ebook_converter.ebooks.tweak import zip_rebuilder from ebook_converter.utils.zipfile import ZipFile input_path = os.path.join(catalog_debug_path, 'input') epub_shell = os.path.join(catalog_debug_path, 'epub_shell.zip') initialize_container(epub_shell, opf_name='content.opf') with ZipFile(epub_shell, 'r') as zf: zf.extractall(path=input_path) os.remove(epub_shell) zip_rebuilder(input_path, os.path.join(catalog_debug_path, 'input.epub')) if opts.verbose: log.info(" Catalog creation complete (%s)\n" % str(datetime.timedelta(seconds=int(time.time() - opts.start_time)))) # returns to gui2.actions.catalog:catalog_generated() return catalog.error
def metadata_to_xmp_packet(mi): A = ElementMaker(namespace=NS_MAP['x'], nsmap=nsmap('x')) R = ElementMaker(namespace=NS_MAP['rdf'], nsmap=nsmap('rdf')) root = A.xmpmeta(R.RDF) rdf = root[0] dc = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap('dc')) dc.set(expand('rdf:about'), '') rdf.append(dc) for prop, tag in { 'title': 'dc:title', 'comments': 'dc:description' }.items(): val = mi.get(prop) or '' create_alt_property(dc, tag, val) for prop, (tag, ordered) in { 'authors': ('dc:creator', True), 'tags': ('dc:subject', False), 'publisher': ('dc:publisher', False) }.items(): val = mi.get(prop) or () if isinstance(val, (str, bytes)): val = [val] create_sequence_property(dc, tag, val, ordered) if not mi.is_null('pubdate'): # Adobe spec recommends local time create_sequence_property(dc, 'dc:date', [isoformat(mi.pubdate, as_utc=False)]) if not mi.is_null('languages'): langs = list( filter( None, map(lambda x: lang_as_iso639_1(x) or canonicalize_lang(x), mi.languages))) if langs: create_sequence_property(dc, 'dc:language', langs, ordered=False) xmp = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap('xmp', 'xmpidq')) xmp.set(expand('rdf:about'), '') rdf.append(xmp) extra_ids = {} for x in ('prism', 'pdfx'): p = extra_ids[x] = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap(x)) p.set(expand('rdf:about'), '') rdf.append(p) identifiers = mi.get_identifiers() if identifiers: create_identifiers(xmp, identifiers) for scheme, val in identifiers.items(): if scheme in {'isbn', 'doi'}: for prefix, parent in extra_ids.items(): ie = parent.makeelement(expand('%s:%s' % (prefix, scheme))) ie.text = val parent.append(ie) d = xmp.makeelement(expand('xmp:MetadataDate')) d.text = isoformat(now(), as_utc=False) xmp.append(d) calibre = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap('calibre', 'calibreSI', 'calibreCC')) calibre.set(expand('rdf:about'), '') rdf.append(calibre) if not mi.is_null('rating'): try: r = float(mi.rating) except (TypeError, ValueError): pass else: create_simple_property(calibre, 'calibre:rating', '%g' % r) if not mi.is_null('series'): create_series(calibre, mi.series, mi.series_index) if not mi.is_null('timestamp'): create_simple_property(calibre, 'calibre:timestamp', isoformat(mi.timestamp, as_utc=False)) for x in ('author_link_map', 'user_categories'): val = getattr(mi, x, None) if val: create_simple_property(calibre, 'calibre:' + x, dump_dict(val)) for x in ('title_sort', 'author_sort'): if not mi.is_null(x): create_simple_property(calibre, 'calibre:' + x, getattr(mi, x)) all_user_metadata = mi.get_all_user_metadata(True) if all_user_metadata: create_user_metadata(calibre, all_user_metadata) return serialize_xmp_packet(root)
def get_metadata(stream, extract_cover=True): whitespace = re.compile(r'\s+') def normalize(s): return whitespace.sub(' ', s).strip() with ZipFile(stream) as zf: meta = zf.read('meta.xml') root = fromstring(meta) def find(field): ns, tag = fields[field] ans = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns}) if ans: return normalize( tostring(ans[0], method='text', encoding='unicode', with_tail=False)).strip() mi = MetaInformation(None, []) title = find('title') if title: mi.title = title creator = find('initial-creator') or find('creator') if creator: mi.authors = string_to_authors(creator) desc = find('description') if desc: mi.comments = desc lang = find('language') if lang and canonicalize_lang(lang): mi.languages = [canonicalize_lang(lang)] kw = find('keyword') or find('keywords') if kw: mi.tags = [x.strip() for x in kw.split(',') if x.strip()] data = {} for tag in root.xpath('//ns0:user-defined', namespaces={'ns0': fields['user-defined'][0]}): name = (tag.get('{%s}name' % METANS) or '').lower() vtype = tag.get('{%s}value-type' % METANS) or 'string' val = tag.text if name and val: if vtype == 'boolean': val = val == 'true' data[name] = val opfmeta = False # we need this later for the cover opfnocover = False if data.get('opf.metadata'): # custom metadata contains OPF information opfmeta = True if data.get('opf.titlesort', ''): mi.title_sort = data['opf.titlesort'] if data.get('opf.authors', ''): mi.authors = string_to_authors(data['opf.authors']) if data.get('opf.authorsort', ''): mi.author_sort = data['opf.authorsort'] if data.get('opf.isbn', ''): isbn = check_isbn(data['opf.isbn']) if isbn is not None: mi.isbn = isbn if data.get('opf.publisher', ''): mi.publisher = data['opf.publisher'] if data.get('opf.pubdate', ''): mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True) if data.get('opf.identifiers'): try: mi.identifiers = json.loads(data['opf.identifiers']) except Exception: pass if data.get('opf.rating'): try: mi.rating = max(0, min(float(data['opf.rating']), 10)) except Exception: pass if data.get('opf.series', ''): mi.series = data['opf.series'] if data.get('opf.seriesindex', ''): try: mi.series_index = float(data['opf.seriesindex']) except Exception: mi.series_index = 1.0 if data.get('opf.language', ''): cl = canonicalize_lang(data['opf.language']) if cl: mi.languages = [cl] opfnocover = data.get('opf.nocover', False) if not opfnocover: try: read_cover(stream, zf, mi, opfmeta, extract_cover) except Exception: pass # Do not let an error reading the cover prevent reading other data return mi