def performPageMapUpdates(data, newbkpath, oldbkpath, keylist, valuelist):
    """Rewrite every <page href="..."> in a page-map so links stay valid after files move.

    keylist/valuelist serialize a mapping from old target bookpaths to new
    bookpaths; each relative href is retargeted and re-expressed relative to
    newbkpath.  Returns the updated xml as a unicode string.
    """
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary of xml_updates properly adjusted
    updates = OrderedDict()
    for pos, key in enumerate(keylist):
        updates[key] = valuelist[pos]
    xml_empty_tags = ["page"]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all(["page"]):
        for att in ["href"]:
            if att not in tag.attrs:
                continue
            ref = tag[att]
            # leave scheme-qualified (external) references untouched
            if ref.find(":") != -1:
                continue
            parts = ref.split('#')
            apath = urldecodepart(parts[0])
            fragment = urldecodepart(parts[1]) if len(parts) > 1 else ""
            # map the old target bookpath through the updates table
            oldtarget = buildBookPath(apath, startingDir(oldbkpath))
            newtarget = updates.get(oldtarget, oldtarget)
            attribute_value = urlencodepart(buildRelativePath(newbkpath, newtarget))
            if fragment != "":
                attribute_value = attribute_value + "#" + urlencodepart(fragment)
            tag[att] = attribute_value
    return soup.decodexml(indent_level=0, formatter='minimal', indent_chars=" ")
def anchorNCXUpdates(data, ncx_bookpath, originating_bookpath, keylist, valuelist):
    """Retarget NCX <content src="..."> fragment anchors after content moved.

    keylist/valuelist serialize a mapping from fragment id to the bookpath
    now holding that id; any src that pointed at originating_bookpath#id is
    rewritten to the new location.  Returns the updated xml as unicode.
    """
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary
    id_dict = OrderedDict()
    for pos, key in enumerate(keylist):
        id_dict[key] = valuelist[pos]
    startdir = startingDir(ncx_bookpath)
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" not in tag.attrs:
            continue
        src = tag["src"]
        # skip scheme-qualified (external) references
        if src.find(":") != -1:
            continue
        parts = src.split('#')
        apath = urldecodepart(parts[0])
        # convert this path to its target bookpath
        target_bookpath = buildBookPath(apath, startdir)
        if (parts is not None) and (len(parts) > 1) and (target_bookpath == originating_bookpath) and (parts[1] != ""):
            fragment_id = urldecodepart(parts[1])
            if fragment_id in id_dict:
                target_bookpath = id_dict[fragment_id]
            attribute_value = urlencodepart(buildRelativePath(ncx_bookpath, target_bookpath))
            attribute_value = attribute_value + "#" + urlencodepart(fragment_id)
            tag["src"] = attribute_value
    return soup.decodexml(indent_level=0, formatter='minimal', indent_chars=" ")
def anchorNCXUpdatesAfterMerge(data, ncx_bookpath, sink_bookpath, merged_bookpaths):
    """Redirect NCX <content src="..."> links that pointed into merged files.

    Any src whose target bookpath is in merged_bookpaths is retargeted to
    sink_bookpath (the file the others were merged into), preserving any
    fragment identifier.  Returns the updated xml as a unicode string.
    """
    data = _remove_xml_header(data)
    startdir = startingDir(ncx_bookpath)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            # only relative (non scheme-qualified) references are rewritten
            if src.find(":") == -1:
                parts = src.split('#')
                if parts is not None:
                    apath = urldecodepart(parts[0])
                    target_bookpath = buildBookPath(apath, startdir)
                    if target_bookpath in merged_bookpaths:
                        attribute_value = urlencodepart(buildRelativePath(ncx_bookpath, sink_bookpath))
                        if len(parts) > 1 and parts[1] != "":
                            fragment = urldecodepart(parts[1])
                            # BUG FIX: re-encode the *decoded* fragment.  The old
                            # code appended urlencodepart(parts[1]) — parts[1] is
                            # still percent-encoded, so escapes were encoded twice
                            # and the decoded `fragment` variable went unused.
                            attribute_value += "#" + urlencodepart(fragment)
                        tag["src"] = attribute_value
    return soup.decodexml(indent_level=0, formatter='minimal', indent_chars=" ")
def deleteotherfile(self, book_href):
    """Delete a non-manifest ("other") file from the book.

    Removes the file from outdir if it was added/modified this session, and
    records the deletion unless the file never left the original epub.
    Raises WrapperException for an invalid, manifest, missing, or protected href.
    """
    id = _unicodestr(book_href)
    id = urldecodepart(id)
    if id is None:
        # BUG FIX: message previously read 'None is not a valid book hrefbook href'
        # (duplicated text); now matches the sibling routines.
        raise WrapperException('None is not a valid book href')
    if id not in self.other and id in self.id_to_href:
        raise WrapperException('Incorrect interface routine - use deletefile')
    filepath = self.book_href_to_filepath.get(id, None)
    if filepath is None:
        raise WrapperException('Book href does not exist')
    if id in PROTECTED_FILES or id == self.opfbookpath:
        raise WrapperException('attempt to delete protected file')
    add_to_deleted = True
    # if file was added or modified delete file from outdir
    if id in self.added or id in self.modified:
        filepath = os.path.join(self.outdir, filepath)
        if os.path.exists(filepath) and os.path.isfile(filepath):
            os.remove(filepath)
        if id in self.added:
            self.added.remove(id)
            # a file added this session was never in the original epub,
            # so there is nothing to report as deleted
            add_to_deleted = False
        if id in self.other:
            self.other.remove(id)
        if id in self.modified:
            del self.modified[id]
    if add_to_deleted:
        self.deleted.append(('other', id, book_href))
        del self.book_href_to_filepath[id]
def readotherfile(self, book_href):
    """Read a non-manifest ("other") file and return its contents.

    Text mimetypes are decoded to str; everything else is returned as bytes.
    Raises WrapperException for an invalid, manifest-owned, or missing href.
    """
    id = _unicodestr(book_href)
    id = urldecodepart(id)
    if id is None:
        raise WrapperException('None is not a valid book href')
    if id not in self.other and id in self.id_to_href:
        raise WrapperException('Incorrect interface routine - use readfile')
    # handle special case of trying to read the opf after it has been modified
    if id == self.opfbookpath and id in self.modified:
        return self.build_opf()
    filepath = self.book_href_to_filepath.get(id, None)
    if filepath is None:
        raise WrapperException('Book href does not exist')
    # added/modified files live in outdir, untouched ones in the original epub
    basedir = self.outdir if (id in self.added or id in self.modified) else self.ebook_root
    filepath = os.path.join(basedir, filepath)
    if not os.path.exists(filepath):
        raise WrapperException('File Does Not Exist')
    ext = os.path.splitext(os.path.basename(filepath))[1].lower()
    mime = ext_mime_map.get(ext, "")
    with open(filepath, 'rb') as fp:
        data = fp.read()
    if mime in TEXT_MIMETYPES:
        data = _unicodestr(data)
    return data
def getmime(self, href):
    """Return the mimetype for href's file extension ("" when unknown)."""
    decoded = urldecodepart(_unicodestr(href))
    extension = os.path.splitext(os.path.basename(decoded))[1].lower()
    return ext_mime_map.get(extension, "")
def _parseData(self):
    """Walk the opf tag stream and populate self.package, self.metadata,
    self.manifest, self.spine, self.guide, and self.bindings.
    """
    cnt = 0
    for prefix, tname, tattr, tcontent in self._opf_tag_iter():
        # package
        if tname == "package":
            ver = tattr.pop("version", "2.0")
            uid = tattr.pop("unique-identifier", "bookid")
            if self.ns_remap:
                if "xmlns:opf" in tattr:
                    tattr.pop("xmlns:opf")
                # BUG FIX: the opf namespace is "www.idpf.org", the source
                # previously had "www/idpf.org" (slash instead of dot)
                tattr["xmlns"] = "http://www.idpf.org/2007/opf"
            self.package = (ver, uid, tattr)
            continue
        # metadata
        if tname == "metadata":
            if self.ns_remap:
                if "xmlns:opf" not in tattr:
                    # BUG FIX: same namespace typo as above
                    tattr["xmlns:opf"] = "http://www.idpf.org/2007/opf"
            self.metadata_attr = tattr
            continue
        # NOTE(review): `and` binds tighter than `or`, so meta/link tags are
        # collected regardless of prefix; only dc: tags require "metadata"
        # in prefix.  Preserved as-is — confirm this is intended.
        if tname in ["meta", "link"] or tname.startswith("dc:") and "metadata" in prefix:
            self.metadata.append((tname, tcontent, tattr))
            continue
        # manifest
        if tname == "item" and "manifest" in prefix:
            nid = "xid%03d" % cnt
            cnt += 1
            id = tattr.pop("id", nid)
            # must keep all hrefs in quoted (encoded) form
            # if relative, then no fragments so decode and then encode for safety
            href = tattr.pop("href", "")
            if href.find(':') == -1:
                href = urldecodepart(href)
                href = urlencodepart(href)
            mtype = tattr.pop("media-type", "")
            self.manifest.append((id, href, mtype, tattr))
            continue
        # spine
        if tname == "spine":
            self.spine_attr = tattr
            continue
        if tname == "itemref" and "spine" in prefix:
            idref = tattr.pop("idref", "")
            self.spine.append((idref, tattr))
            continue
        # guide
        if tname == "reference" and "guide" in prefix:
            type = tattr.pop("type", "")
            title = tattr.pop("title", "")
            # must keep all hrefs in quoted (encoded) form
            href = tattr.pop("href", "")
            self.guide.append((type, title, href))
            continue
        # bindings
        if tname in ["mediaType", "mediatype"] and "bindings" in prefix:
            mtype = tattr.pop("media-type", "")
            handler = tattr.pop("handler", "")
            self.bindings.append((mtype, handler))
            continue
def setguide(self, new_guide):
    """Replace the opf guide with new_guide, a list of (type, title, href).

    Unknown guide types are prefixed with "other.", missing titles get a
    placeholder, and every href (sans fragment) must resolve to a manifest
    entry or WrapperException is raised.  Marks the opf as modified.
    """
    validated = []
    for (gtype, gtitle, ghref) in new_guide:
        gtype = _unicodestr(gtype)
        gtitle = _unicodestr(gtitle)
        ghref = _unicodestr(ghref)
        if gtype not in _guide_types:
            gtype = "other." + gtype
        if gtitle is None:
            gtitle = 'title missing'
        # strip any fragment before checking the manifest
        target = urldecodepart(ghref.split('#')[0])
        if target not in self.href_to_id:
            raise WrapperException('guide href not in manifest')
        validated.append((gtype, gtitle, ghref))
    self.guide = validated
    self.modified[self.opfbookpath] = 'file'
def addotherfile(self, book_href, data):
    """Create a new non-manifest ("other") file in outdir and register it.

    Raises WrapperException when the href is None, already registered, or
    its target path already exists on disk.
    """
    id = urldecodepart(_unicodestr(book_href))
    if id is None:
        raise WrapperException('None is not a valid book href')
    if id in self.other:
        raise WrapperException('Book href must be unique')
    relative_path = id.replace("/", os.sep)
    destination = os.path.join(self.outdir, relative_path)
    if os.path.isfile(destination):
        raise WrapperException('Desired path already exists')
    parent_dir = os.path.dirname(destination)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)
    if isinstance(data, str):
        data = _utf8str(data)
    with open(destination, 'wb') as fp:
        fp.write(data)
    # register the new file in the bookkeeping structures
    self.other.append(id)
    self.added.append(id)
    self.book_href_to_filepath[id] = relative_path
def writeotherfile(self, book_href, data):
    """Overwrite an existing non-manifest ("other") file in outdir.

    Raises WrapperException for an invalid, manifest-owned, unknown, or
    protected href.  Marks the file as modified.
    """
    id = urldecodepart(_unicodestr(book_href))
    if id is None:
        raise WrapperException('None is not a valid book href')
    if id not in self.other and id in self.id_to_href:
        raise WrapperException('Incorrect interface routine - use writefile')
    relative_path = self.book_href_to_filepath.get(id, None)
    if relative_path is None:
        raise WrapperException('Book href does not exist')
    if id in PROTECTED_FILES or id == self.opfbookpath:
        raise WrapperException('Attempt to modify protected file')
    destination = os.path.join(self.outdir, relative_path)
    parent_dir = os.path.dirname(destination)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)
    if isinstance(data, str):
        data = _utf8str(data)
    with open(destination, 'wb') as fp:
        fp.write(data)
    self.modified[id] = 'file'
def map_href_to_id(self, href, ow):
    """Return the manifest id for the (encoded) href, or ow when unknown."""
    key = urldecodepart(_unicodestr(href))
    return self.href_to_id.get(key, ow)
def build_bookpath(self, href, starting_dir):
    """Resolve an (encoded) href against starting_dir into a bookpath."""
    decoded_href = urldecodepart(_unicodestr(href))
    return buildBookPath(decoded_href, _unicodestr(starting_dir))
def parse_nav(qp, navdata, navbkpath, newdir):
    """Parse an epub3 nav document into toc, page-list, and landmark data.

    qp is a streaming tag parser (setContent/parse_iter); navdata is the nav
    xhtml; navbkpath is the nav's bookpath; newdir is the directory the
    returned hrefs should be relative to.

    Returns (toclist, pagelist, landmarks, maxlvl, pgcnt) where
    toclist   = [(play_order, level, href, title), ...],
    pagelist  = [(page_number, href, title), ...],
    landmarks = [(guide_type, href, title), ...],
    maxlvl    = deepest <ol> nesting seen in the toc nav,
    pgcnt     = number of page-list entries.
    """
    qp.setContent(navdata)
    toclist = []
    pagelist = []
    landmarks = []
    lvl = 0          # current <ol> nesting depth
    pgcnt = 0
    maxlvl = -1
    nav_type = None  # epub:type of the <nav> currently being walked
    href = None
    title = ""
    play = 0         # play order counter for toc entries
    navdir = startingDir(navbkpath)
    for txt, tp, tname, ttype, tattr in qp.parse_iter():
        if txt is not None:
            # accumulate text only while inside an <a>; anything else resets it
            if ".a." in tp or tp.endswith(".a"):
                title = title + txt
            else:
                title = ""
        else:
            if tname == "nav":
                if ttype == "begin":
                    nav_type = tattr.get("epub:type", None)
                if ttype == "end":
                    nav_type = None
                continue
            # track <ol> nesting inside the navs we care about
            if tname == "ol" and nav_type is not None and nav_type in ("toc", "page-list", "landmarks"):
                if ttype == "begin":
                    lvl += 1
                    if nav_type == "toc":
                        if lvl > maxlvl:
                            maxlvl = lvl
                if ttype == "end":
                    lvl -= 1
                continue
            if tname == "a" and ttype == "begin":
                # get the raw href (urlencoded)
                href = tattr.get("href", "")
                if href.find(":") == -1:
                    # first strip off any fragment
                    fragment = ""
                    if href.find("#") != -1:
                        href, fragment = href.split("#")
                    # find destination bookpath
                    href = urldecodepart(href)
                    fragment = urldecodepart(fragment)
                    if href.startswith("./"):
                        href = href[2:]
                    if href == "":
                        # a bare fragment points back into the nav itself
                        destbkpath = navbkpath
                    else:
                        destbkpath = buildBookPath(href, navdir)
                    # create relative path to destbkpath from newdir
                    href = relativePath(destbkpath, newdir)
                    href = urlencodepart(href)
                    fragment = urlencodepart(fragment)
                    if fragment != "":
                        href = href + "#" + fragment
                epubtype = tattr.get("epub:type", None)
                continue
            if tname == "a" and ttype == "end":
                # the accumulated title and last-seen href describe this entry
                if nav_type == "toc":
                    play += 1
                    toclist.append((play, lvl, href, title))
                elif nav_type == "page-list":
                    pgcnt += 1
                    pagelist.append((pgcnt, href, title))
                elif nav_type == "landmarks":
                    if epubtype is not None:
                        # map the epub:type to its epub2 guide equivalent
                        gtype = _epubtype_guide_map.get(epubtype, None)
                        landmarks.append((gtype, href, title))
                title = ""
                continue
    return toclist, pagelist, landmarks, maxlvl, pgcnt