def __init__(self, href, opf_str = None, container_path = None, package = None): ''' Parameters ---------- href : string Absolute file path of this opf through which it is being opened. Keyword arguments: opf_str -- if available the contents of the OPF file itself to parse (Default None) container_path -- the path of this opf within the epub container (Default None) ''' # The etree root element if opf_str: self.package_el = etree.fromstring(opf_str) #check for a title - old versions of eXe made invalid opfs with no title if self._get_title_el() is None: metadata_el = self.package_el.find(".//{%s}metadata" % EPUBOPF.NAMESPACE_OPF) title_el = etree.SubElement(metadata_el, "{%s}title" % EPUBOPF.NAMESPACE_DC) title_el.text = "Package" else: self.package_el = None # this could be construction of a blank item self.href = href self.nav_el_id = None self.navigation_doc = None self.container_path = container_path self.package = package self.resource_manager = EPUBResourceManager(self.package, self)
def _save_page_html(self, page_id, html_el): page_fd = open(self._get_item_path(page_id), "w") EPUBResourceManager.clean_html_el(html_el) page_fd.write(etree.tostring(html_el, encoding = "UTF-8", pretty_print = True)) page_fd.flush() page_fd.close()
class EPUBOPF(object): ''' classdocs ''' NAMESPACE_OPF = "http://www.idpf.org/2007/opf" NAMESPACE_DC = "http://purl.org/dc/elements/1.1/" '''Maximum number of suffixes we will go through to find a free filename ''' FIND_FILENAME_MAX_ATTEMPTS = 500 def __init__(self, href, opf_str = None, container_path = None, package = None): ''' Parameters ---------- href : string Absolute file path of this opf through which it is being opened. Keyword arguments: opf_str -- if available the contents of the OPF file itself to parse (Default None) container_path -- the path of this opf within the epub container (Default None) ''' # The etree root element if opf_str: self.package_el = etree.fromstring(opf_str) #check for a title - old versions of eXe made invalid opfs with no title if self._get_title_el() is None: metadata_el = self.package_el.find(".//{%s}metadata" % EPUBOPF.NAMESPACE_OPF) title_el = etree.SubElement(metadata_el, "{%s}title" % EPUBOPF.NAMESPACE_DC) title_el.text = "Package" else: self.package_el = None # this could be construction of a blank item self.href = href self.nav_el_id = None self.navigation_doc = None self.container_path = container_path self.package = package self.resource_manager = EPUBResourceManager(self.package, self) def save(self): package_file = open(self.href, "w") package_file.write(etree.tostring(self.package_el, encoding = "UTF-8", pretty_print = True)) package_file.flush() package_file.close() def set_package_changed(self, changed = True): """Set the package changed flag""" if self.package is not None: self.package.isChanged = changed def _get_identifier_el(self): identifier_el = None unique_id = self.package_el.get("unique-identifier") if unique_id is not None: identifier_el = self.package_el.find(".//{%s}identifier[@id='%s']" % (EPUBOPF.NAMESPACE_DC, unique_id)) else: identifier_el = self.package_el.find(".//{%s}identifier" % EPUBOPF.NAMESPACE_DC) return identifier_el def get_opf_id(self): return self._get_identifier_el().text def set_opf_id(self, opf_id, auto_save = True): identifier_el = self._get_identifier_el() identifier_el.text = opf_id if auto_save: self.save() def _get_title_el(self): return self.package_el.find(".//{%s}title" % (EPUBOPF.NAMESPACE_DC)) def get_title(self): return self._get_title_el().text def set_title(self, title, auto_save = True): self._get_title_el().text = title if auto_save: self.save() title = property(get_title, set_title) @property def manifest(self): item_els =self.package_el.findall("./{%(ns)s}manifest/{%(ns)s}item" % {'ns' : EPUBOPF.NAMESPACE_OPF}) manifest = {} for item in item_els: new_item = EPUBOPFItem(item) manifest[item.get("id")] = new_item if new_item.is_nav(): self.nav_el_id = item.get("id") return manifest def add_item_to_manifest(self, item_id, media_type, href, properties = None, auto_save = True): """Adds an item to the OPF file manifest Parameters ---------- item_id : String As per id attribute - must be unique media_type : String """ manifest_el = self.package_el.find("./{%s}manifest" % EPUBOPF.NAMESPACE_OPF) new_item_el = etree.SubElement(manifest_el, "{%s}item" % EPUBOPF.NAMESPACE_OPF, href = href, id = item_id) new_item_el.set("media-type", media_type) if auto_save: self.save() return new_item_el def add_file(self, src_file, path_in_package, auto_update = False, media_type = None, auto_save = True): """ Adds an external file to the manifest in the given location path_in_package _MAY_ be adjusted in case such a file already exists auto_update - If true and the package already contains a file of the same name: then we will overwrite it, otherwise we should look for a new name within the package Returns a tuple containing the manifest item id and the path in package """ if media_type is None: media_type = mimetypes.guess_type(path_in_package)[0] dst_dir = os.path.join(os.path.dirname(self.href), os.path.dirname(path_in_package)) if not os.path.isdir(dst_dir): os.makedirs(dst_dir) shutil.copy2(src_file, os.path.join(os.path.dirname(self.href), path_in_package)) new_file = True if auto_update: if self.contains_href(path_in_package): new_file = False manifest_id = None if new_file: manifest_id = self.get_id_for_href(path_in_package) self.add_item_to_manifest(manifest_id, media_type, path_in_package) else: manifest_id = self.get_item_by_href(path_in_package) return (manifest_id, path_in_package) def handle_item_renamed(self, old_href, new_href, auto_save = True): """Handle when an item in the manifest has been renamed """ item_el = self.package_el.find(".//{%s}item[@href='%s']" % (EPUBOPF.NAMESPACE_OPF, old_href)) itemrefs = self.package_el.findall(".//{%s}itemref[@idref='%s']" % (EPUBOPF.NAMESPACE_OPF, item_el.get("id"))) item_el.set("href", new_href) new_id = self.get_id_for_href(new_href) item_el.set("id", new_id) if itemrefs is not None: for item in itemrefs: item.set("idref", new_id) if auto_save: self.save() def delete_item_by_href(self, href, auto_save = True): """Delete an item from the manifest and removes the file from resourceDir""" item_el = self.package_el.find(".//{%s}item[@href='%s']" % (EPUBOPF.NAMESPACE_OPF, href)) self.delete_item(item_el, auto_save = auto_save) def delete_item_by_id(self, item_id, auto_save = True): item_el = self.package_el.find(".//{%s}item[@id='%s']" % (EPUBOPF.NAMESPACE_OPF, item_id)) self.delete_item(item_el, auto_save = auto_save) def delete_item(self, item_el, auto_save = True): item_id = item_el.get("id") item_el.getparent().remove(item_el) item_path = os.path.join(os.path.dirname(self.href), item_el.get("href")) if os.path.isfile(item_path): os.remove(item_path) spine_itemref = self.package_el.findall(".//{%s}itemref[@idref='%s']") for itemref in spine_itemref: spine_itemref.getparent().remove(itemref) if auto_save: self.save() def get_id_for_href(self, href): #TODO: tidy this into a valid id return href def find_free_filename(self, basename, extension): """Find a free filename for the given basename and extension if already taken generate a new value in the form of basename_NUM.extension Parameters ---------- basename : string The file basename to use e.g. "mypage" extension : string The end extension to use including . e.g. ".xhtml" """ manifest_items = self.manifest suffix = "" for attempt_count in range(0, EPUBOPF.FIND_FILENAME_MAX_ATTEMPTS): current_filename = basename + suffix + extension name_taken = False for id, item in manifest_items.iteritems(): if item.href == current_filename: name_taken = True break if not name_taken: return current_filename suffix = "_%s" % str(attempt_count) return None def get_navigation(self): if self.navigation_doc: return self.navigation_doc if not self.nav_el_id: manifest = self.manifest nav_el = self.package_el.find(".//*[@id='%s']" % self.nav_el_id) nav_doc_path = os.path.join(os.path.dirname(self.href), nav_el.get("href")) nav_doc_str = open(nav_doc_path).read() from exe.engine.epubnav import EPUBNavDocument self.navigation_doc = EPUBNavDocument(self, EPUBOPFItem(nav_el), nav_doc_str, file_path = nav_doc_path) return self.navigation_doc def get_item_by_href(self, href): for id, item in self.manifest.iteritems(): if item.href == href: return item return None def get_item_by_id(self, id): for item_id, item in self.manifest.iteritems(): if item_id == id: return item return None def contains_href(self, href): href_el = self.package_el.find(".//{%s}item[@href='%s']" % (EPUBOPF.NAMESPACE_OPF, href)) return href_el is not None def _get_item_path(self, item_id): """Return the location on the filesystem of the given item id""" item = self.get_item_by_id(item_id) if item is not None: return os.path.join(os.path.dirname(self.href), item.href) else: return None def _get_page_html_el(self, page_id): return etree.parse(self._get_item_path(page_id)).getroot() def _save_page_html(self, page_id, html_el): page_fd = open(self._get_item_path(page_id), "w") EPUBResourceManager.clean_html_el(html_el) page_fd.write(etree.tostring(html_el, encoding = "UTF-8", pretty_print = True)) page_fd.flush() page_fd.close() def set_page_idevice_html(self, page_id, idevice_id, html): """Save the actual HTML content of the idevice""" page_path = self._get_item_path(page_id) page_html_el = self._get_page_html_el(page_id) ns_xhtml = page_html_el.nsmap.get(None) idevice_el = page_html_el.find('.//{%s}*[@id="id%s"]' % (ns_xhtml, idevice_id)) #empty the eleement as it is now for child in idevice_el: idevice_el.remove(child) #process and look for resources that need added to the package html = self.resource_manager.process_previewed_images(html, page_id, idevice_id) #pack it into a single element so it parses OK html = "<div xmlns=\"%s\">%s</div>" % (ns_xhtml, html) soup = BeautifulSoup(html) new_el = etree.fromstring(soup.find("div").prettify(formatter="xml")) #the formatter will add white space - which messes up text areas for textel in new_el.findall(".//{%s}textarea"): if textel.text: textel.text = textel.text.strip() for el in new_el: idevice_el.append(el) self._save_page_html(page_id, page_html_el) def delete_idevice_from_page(self, page_id, idevice_id): page_html_el = self._get_page_html_el(page_id) ns_xhtml = page_html_el.nsmap.get(None) idevice_el = page_html_el.find('.//{%s}*[@id="id%s"]' % (ns_xhtml, idevice_id)) idevice_el.getparent().remove(idevice_el) self._save_page_html(page_id, page_html_el) def update_spine(self, auto_save = True): """ Update the spine to include newly added pages """ spine_el = self.package_el.find(".//{%s}spine" % EPUBOPF.NAMESPACE_OPF) itemref_attribs = {} for itemref in spine_el: itemref_attribs[itemref.get("idref")] = itemref.attrib spine_el.remove(itemref) self.add_navitem_to_spine(spine_el, self.get_navigation(), children_only=True, itemref_attribs = itemref_attribs) if auto_save: self.save() def add_navitem_to_spine(self, spine_el, nav_item, itemref_attribs = {}, children_only = False): """ Adds the given nav_item and it's children to the given spine_el in order - this method uses itself recursively. itemref_attribs: A dictionary where keys given should be the idref which should contain another dictionary of the attributes that should be applied to that spine element. """ if not children_only: itemref_el = etree.SubElement(spine_el, "{%s}itemref" % EPUBOPF.NAMESPACE_OPF) itemref_el.set("idref", nav_item.epub_item.id) if nav_item.epub_item.id in itemref_attribs.keys(): attribs = itemref_attribs[nav_item.epub_item.id] if "linear" in attribs.keys(): itemref_el.set("linear", attribs.get("linear")) if nav_item.children is not None: for child in nav_item.children: self.add_navitem_to_spine(spine_el, child, itemref_attribs = itemref_attribs) def get_spine_item_by_item_id(self, item_id): spine_item_el = self.package_el.find(".//{%(ns)s}spine/{%(ns)s}itemref[@idref=\"%(item_id)s\"]" % \ {"ns" : EPUBOPF.NAMESPACE_OPF, "item_id" : item_id}) return spine_item_el