def _init_read(self):
    """Load the OPF package and the NCX table of contents of an existing epub.

    Sets ``self.opf_path``, ``self.opf``, ``self.uid`` and ``self.toc``.
    A ``SyntaxWarning`` is emitted when no identifier matches the package's
    ``uid_id`` and when the spine does not reference an NCX item.
    """
    # container.xml lists the rootfiles; the OPF one carries the package path.
    container_doc = minidom.parseString(self.read('META-INF/container.xml'))
    for rootfile in container_doc.documentElement.getElementsByTagName('rootfile'):
        if rootfile.getAttribute('media-type') != MIMETYPE_OPF:
            continue
        # Only the first OPF rootfile is taken into account.
        self.opf_path = rootfile.getAttribute('full-path')
        break

    # Parse the OPF package document.
    self.opf = opf.parse_opf(self.read(self.opf_path))

    # Keep the identifiers whose second field equals the package's uid_id.
    matching_ids = [
        identifier
        for identifier in self.opf.metadata.identifiers
        if identifier[1] == self.opf.uid_id
    ]
    if not matching_ids:
        self.uid = None
        warnings.warn('The ePub does not define any uid', SyntaxWarning)
    else:
        self.uid = matching_ids[0]

    ncx_item = self.get_item(self.opf.spine.toc)

    # Reset first so that self.toc is None if parsing the NCX raises.
    self.toc = None
    if ncx_item is None:
        warnings.warn('The ePub does not define any NCX file', SyntaxWarning)
        # Fall back to an empty NCX carrying the package identifier.
        # NOTE(review): the uid assignment is assumed to belong to this
        # branch only — confirm against the original indentation.
        self.toc = ncx.Ncx()
        self.toc.uid = self.uid
    else:
        self.toc = ncx.parse_toc(self.read_item(ncx_item))
def maj_meta(monepub, metaname, metac, metacal, copyr):
    """Replace the metadata node of the epub's OPF file and reload the epub.

    Args:
        monepub: epub object exposing ``chemin``, ``opf``, ``opf_path`` and
            ``init_read()``.
        metaname: path of an external metadata OPF file; when falsy the
            epub's own OPF document is used as the metadata source.
        metac, metacal: forwarded unchanged to ``modifie_metas``.
        copyr: forwarded unchanged to ``metas_xhtml``.

    Side effects: changes the current working directory to
    ``monepub.chemin``, overwrites the OPF file on disk, then calls
    ``monepub.init_read()`` and ``metas_xhtml(monepub, copyr)``.
    """
    os.chdir(monepub.chemin)
    job.log('Met à jour les métadonnées')
    mesmeta = monepub.opf.as_xml_document()

    # Choose the document the new metadata node is built from.
    if metaname:
        job.log('\t il existe un metadata.opf')
        source_xml = opf.parse_opf(os.path.normpath(metaname)).as_xml_document()
    else:
        job.log('\t on utilise les metas internes')
        source_xml = mesmeta
    metaxml = modifie_metas(source_xml, metac, metacal)

    # Swap the first child of each <package> for the new metadata node.
    # NOTE(review): assumes the first child is the metadata element and its
    # next sibling the manifest — confirm against as_xml_document().
    for package in mesmeta.getElementsByTagName('package'):
        old_metas = package.firstChild
        manifest = old_metas.nextSibling
        package.removeChild(old_metas)
        package.insertBefore(metaxml, manifest)

    path = os.path.normpath(os.path.join(monepub.chemin, monepub.opf_path))
    # The context manager closes the file even if serialisation/writing fails.
    with open(path, mode='w', encoding='utf8', errors='strict') as f:
        f.write(mesmeta.toprettyxml())

    # Re-read the epub so that its in-memory state matches the new OPF.
    monepub.init_read()
    metas_xhtml(monepub, copyr)
def maj_publication(monepub, copyright):
    """Write the smallest copyright value as publication date in the OPF file.

    The date written is ``sorted(copyright)[0] + '-01-01'``. It replaces the
    text of every ``dc:date`` element whose ``opf:event`` attribute is either
    ``'publication'`` or absent. The OPF file is rewritten in place and then
    parsed again with ``opf.parse_opf``.

    Args:
        monepub: epub object exposing ``chemin`` and ``opf_path``.
        copyright: non-empty iterable of strings (presumably years — verify
            against the caller).

    Raises:
        IndexError: if ``copyright`` is empty.
    """
    from PyQt4 import QtGui

    # Updating the progress dialog is best-effort (there may be no dialog),
    # but interpreter-exit signals must not be swallowed.
    try:
        job.ProgressDialog.setLabelText('Récupération de la date de copyright')
        QtGui.QApplication.processEvents()
    except Exception:
        pass

    job.log('Récupération du copyright')
    pubdate = sorted(copyright)[0] + '-01-01'

    # os.path.join instead of a hard-coded backslash keeps the path valid on
    # every platform.
    opf_file = os.path.normpath(os.path.join(monepub.chemin, monepub.opf_path))
    parser = etree.XMLParser(attribute_defaults=False, load_dtd=False,
                             remove_comments=False, ns_clean=True)
    tree = etree.parse(opf_file, parser)

    event_attr = "{http://www.idpf.org/2007/opf}event"
    for date_node in xpath(tree, '//dc:date'):
        # Dates tagged with another event (anything but 'publication') are
        # left untouched.
        if date_node.get(event_attr) in ('publication', None):
            date_node.text = pubdate

    texte = etree.tostring(tree, encoding='utf-8', xml_declaration=True,
                           pretty_print=True)
    with open(opf_file, mode='wb') as f:
        f.write(texte)

    # Parse the rewritten file; the result is not used here.
    opf.parse_opf(opf_file)
def init_read(self):
    """Read the unpacked epub and populate this object's attributes.

    ``META-INF/container.xml`` and the OPF path are opened relative to the
    current working directory.

    Sets ``opf_path``, ``content_path``, ``opf``, ``uid``, ``item_toc``,
    ``auteur``, ``titre``, ``css``, ``liste_items``, ``liste_fichiers`` and
    ``toc``; calls ``set_cover()`` when ``pagedecouv`` is None.
    """
    # Read container.xml to get the OPF file path.
    container_xml = minidom.parse('META-INF/container.xml').documentElement
    for rootfile in container_xml.getElementsByTagName('rootfile'):
        if rootfile.getAttribute('media-type') != MIMETYPE_OPF:
            continue
        # Only the first OPF rootfile is considered.
        # NOTE(review): assumes the loop stops here in every branch —
        # confirm against the original indentation.
        self.opf_path = rootfile.getAttribute('full-path')  # e.g. OEBPS\content.opf or content.opf
        if not os.path.isfile(os.path.join(self.chemin, self.opf_path)):
            # The declared file does not exist: search for an OPF instead.
            self.opf_path = self.cherche_opf(self.chemin)
        if self.opf_path:
            self.content_path = os.path.dirname(self.opf_path)  # e.g. OEBPS or ''
        else:
            job.log('Epub invalide : pas de content.opf')
        break

    # Parse the OPF file.
    # NOTE(review): execution continues here even after the "invalid epub"
    # log above, so parse_opf then receives a falsy path — confirm intent.
    self.opf = opf.parse_opf(self.opf_path)

    # First identifier whose second field equals the package uid_id.
    # Any ordinary failure (e.g. no match) yields None, but
    # KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        self.uid = [x for x in self.opf.metadata.identifiers
                    if x[1] == self.opf.uid_id][0]
    except Exception:
        self.uid = None

    self.item_toc = self.get_item(self.opf.spine.toc)
    if self.item_toc is None:
        # Fall back to any manifest entry whose href contains 'ncx'
        # (the last match wins).
        # NOTE(review): this stores a path string, whereas get_item()
        # presumably returns a manifest item — verify consumers of item_toc.
        for identifier in self.opf.manifest:
            item = self.opf.manifest[identifier]
            if 'ncx' in item.href:
                self.item_toc = os.path.join(self.content_path, item.href)

    creators = self.opf.metadata.creators
    self.auteur = creators[0][0] if creators else ''
    titles = self.opf.metadata.titles
    self.titre = titles[0][0] if titles else ''

    # Collect the css files and, in the same pass, the manifest file lists.
    self.css = []
    self.liste_items = []
    self.liste_fichiers = []
    for identifier in self.opf.manifest:
        item = self.opf.manifest[identifier]
        self.liste_items.append(item.href)
        self.liste_fichiers.append(self.normalize(self.relat_to_abs(item.href)))
        if 'css' in item.href:
            # e.g. OEBPS\Style\styles.css or stylesheet.css
            self.css.append(os.path.join(self.content_path, item.href))

    # Parse the NCX table of contents; unreadable files leave toc as None.
    try:
        self.toc = ncx.parse_toc(self.read_item(self.item_toc))
    except (UnicodeError, IOError):
        self.toc = None

    # Look for the cover page and cover image.
    if self.pagedecouv is None:
        self.set_cover()
def modif_metas_tatoo(monepub):
    """Give the epub a fresh UUID and strip identifying metadata from its OPF.

    Steps, all performed in place on the files under ``monepub.chemin``:

    * generate a new ``urn:uuid:`` identifier and store it in ``monepub.uid``;
    * write it into the ``dtb:uid`` meta of the NCX file;
    * in the OPF: rewrite the unique identifier (id becomes ``BookId``), set
      ``opf:scheme`` to ``uuid`` on the identifiers, remove the other
      identifiers (one with id ``ean`` is kept, truncated to 13 characters),
      remove ``dc:rights`` elements that have text, blank the descriptions
      matching ``Version <digits>`` and remove non-empty comments;
    * re-parse the rewritten OPF with ``opf.parse_opf``.

    Args:
        monepub: epub object exposing ``chemin``, ``content_path``,
            ``opf_path`` and ``item_toc`` (an object with an ``href``).
    """
    # Create a new uuid identifier ('urn:uuid:<uuid4>').
    monepub.uid = uuid.uuid4().urn
    parser = etree.XMLParser(attribute_defaults=False, load_dtd=False,
                             remove_comments=False, ns_clean=True)

    # --- toc.ncx ---
    # os.path.join (as used for the OPF path below) instead of hard-coded
    # backslashes, so the path is valid on every platform.
    mytoc = os.path.normpath(os.path.join(
        monepub.chemin, monepub.content_path, monepub.item_toc.href))
    tree = etree.parse(mytoc, parser)
    # Replace the identifier with the new one.
    meta_uid = XPath('//dt:meta[@name="dtb:uid"]')
    for meta in meta_uid(tree):
        meta.set('content', monepub.uid)
    texte = etree.tostring(tree, encoding='utf-8', xml_declaration=True,
                           pretty_print=True)
    with open(mytoc, mode='wb') as f:
        f.write(texte)

    # --- .opf ---
    myopf = os.path.join(monepub.chemin, monepub.opf_path)
    tree = etree.parse(myopf, parser)
    # Fetch the id of the unique identifier.
    package = XPath('//opf:package')(tree)[0]
    unique = str(package.get('unique-identifier'))
    # Replace the unique identifier; evaluate the query once, because the
    # id it selects on is changed below.
    ident = XPath('//dc:identifier[@id="{}"]'.format(unique))
    matches = ident(tree)
    if matches:
        unique_node = matches[0]
        unique_node.text = monepub.uid
        # Then rename the id on both the identifier and the package.
        unique_node.set('id', 'BookId')
        package.set('unique-identifier', 'BookId')
        job.logtat('remplacement de l\'identifiant unique par : ' + monepub.uid)

    # NOTE(review): the scheme update is assumed to run whether or not the
    # unique identifier was found — confirm against the original indentation.
    for child in xpath(tree, '//dc:identifier'):
        child.set("{http://www.idpf.org/2007/opf}scheme", 'uuid')

    # Remove the other identifiers.
    autres_ident = XPath('//dc:identifier')
    for ident_node in autres_ident(tree):
        if ident_node.get('id') == 'BookId':
            continue
        if ident_node.get('id') == 'ean':
            # Keep the ean, truncated to 13 characters; an element without
            # text is left as is instead of raising TypeError.
            if ident_node.text is not None:
                ident_node.text = ident_node.text[0:13]
        else:
            # str() so that an identifier without text does not break the log.
            job.logtat('suppression de l\'identifiant : "' + str(ident_node.text) + '"')
            ident_node.getparent().remove(ident_node)

    # Remove the dc:rights elements that carry text.
    rights = XPath('//dc:rights')
    for right in rights(tree):
        if right.text is not None:
            job.logtat('suppression du dc:rights : "' + str(right.text) + '" ')
            right.getparent().remove(right)

    # Blank the descriptions containing the version code and date.
    desc = XPath('//dc:description')
    for com in desc(tree):
        if re.search('Version ([0-9]{1,6})', str(com.text)):
            job.logtat('suppression de la description : "' + str(com.text) + '"')
            com.text = ''

    # Remove the comments; collect them first so the tree is not modified
    # while it is being traversed.
    for elem in list(tree.iter(tag=etree.Comment)):
        if elem.text:
            job.logtat('suppression des commentaires dans le opf : "' + str(elem.text) + '"')
            elem.getparent().remove(elem)

    # Rewrite the file.
    texte = etree.tostring(tree, encoding='utf-8', xml_declaration=True,
                           pretty_print=True)
    with open(myopf, mode='wb') as f:
        f.write(texte)
    # Parse the rewritten file; the result is not used here.
    opf.parse_opf(myopf)