Python parse_opf 예제들, opf.parse_opf Python 예제들

예제 #1

0

파일 보기

파일: __init__.py 프로젝트: dorbian/LazyLibrarian

    def _init_read(self):
        """Load package metadata and the table of contents of an existing epub.

        Steps, in order:

        1. Read ``META-INF/container.xml`` and store in ``self.opf_path`` the
           ``full-path`` of the first ``rootfile`` whose media type equals
           ``MIMETYPE_OPF``.
        2. Parse that OPF file into ``self.opf``.
        3. Set ``self.uid`` to the first identifier whose second field equals
           ``self.opf.uid_id``; when none matches, set it to ``None`` and emit
           a ``SyntaxWarning``.
        4. Set ``self.toc`` from the item referenced by ``self.opf.spine.toc``;
           when there is no such item, emit a ``SyntaxWarning`` and fall back
           to an empty ``ncx.Ncx`` that carries ``self.uid``.
        """
        # Locate the OPF file through the container descriptor.
        container_data = self.read('META-INF/container.xml')
        root = minidom.parseString(container_data).documentElement

        for rootfile in root.getElementsByTagName('rootfile'):
            if rootfile.getAttribute('media-type') != MIMETYPE_OPF:
                continue
            # Only the first matching rootfile is used.
            self.opf_path = rootfile.getAttribute('full-path')
            break

        # Parse the OPF package document.
        self.opf = opf.parse_opf(self.read(self.opf_path))

        matching = [ident for ident in self.opf.metadata.identifiers
                    if ident[1] == self.opf.uid_id]
        if not matching:
            self.uid = None
            warnings.warn('The ePub does not define any uid', SyntaxWarning)
        else:
            self.uid = matching[0]

        ncx_item = self.get_item(self.opf.spine.toc)

        # Build the table of contents from the NCX file when one is declared.
        self.toc = None
        if ncx_item is None:
            warnings.warn('The ePub does not define any NCX file',
                          SyntaxWarning)
            self.toc = ncx.Ncx()
            self.toc.uid = self.uid
        else:
            self.toc = ncx.parse_toc(self.read_item(ncx_item))

예제 #2

0

파일 보기

파일: fonctions.py 프로젝트: vdfebook/PersonnaLiseur

def maj_meta(monepub, metaname, metac, metacal, copyr):
    """Replace the metadata node of the epub's OPF file and rewrite it on disk.

    The working directory is changed to ``monepub.chemin``. The replacement
    metadata node is produced by ``modifie_metas`` from either an external
    OPF file (``metaname``) or the epub's own OPF document.

    Args:
        monepub: epub object; its ``chemin``, ``opf`` and ``opf_path``
            attributes are read, and ``init_read()`` is called on it after
            the file has been rewritten.
        metaname: path of an external metadata OPF file; when falsy, the
            epub's internal metadata is used instead.
        metac: forwarded unchanged to ``modifie_metas``.
        metacal: forwarded unchanged to ``modifie_metas``.
        copyr: forwarded unchanged to ``metas_xhtml``.

    Returns:
        None.
    """
    os.chdir(monepub.chemin)
    job.log('Met à jour les métadonnées')
    mesmeta = monepub.opf.as_xml_document()
    if metaname:
        job.log('\t il existe un metadata.opf')
        calibre_meta = opf.parse_opf(os.path.normpath(metaname))
        source_doc = calibre_meta.as_xml_document()
    else:
        job.log('\t on utilise les metas internes')
        source_doc = mesmeta
    metaxml = modifie_metas(source_doc, metac, metacal)

    for package in mesmeta.getElementsByTagName('package'):
        # The first child of <package> is treated as the metadata node and is
        # swapped for the new one, keeping its position before the next sibling.
        metas = package.firstChild
        monmanifest = metas.nextSibling
        package.removeChild(metas)
        package.insertBefore(metaxml, monmanifest)
        path = os.path.normpath(os.path.join(monepub.chemin, monepub.opf_path))
        # Context manager guarantees the handle is closed even if
        # serialisation or the write fails.
        with open(path, mode='w', encoding='utf8', errors='strict') as f:
            f.write(mesmeta.toprettyxml())
        # Re-read the epub so in-memory state reflects the rewritten file.
        monepub.init_read()
        metas_xhtml(monepub, copyr)

예제 #3

0

파일 보기

파일: fonctions.py 프로젝트: vdfebook/PersonnaLiseur

def maj_publication(monepub, copyright):
    """Set the publication date of the epub's OPF file from copyright years.

    The smallest entry of ``copyright`` (by sort order) is suffixed with
    ``'-01-01'`` and written into every ``dc:date`` element whose ``opf:event``
    attribute is either ``'publication'`` or absent. The OPF file is then
    rewritten in place and re-parsed with ``opf.parse_opf``.

    Args:
        monepub: epub object; its ``chemin`` and ``opf_path`` attributes are
            read to locate the OPF file.
        copyright: non-empty iterable of year strings.

    Returns:
        None.

    Raises:
        IndexError: if ``copyright`` is empty.
    """
    from PyQt4 import QtGui
    try:
        job.ProgressDialog.setLabelText('Récupération de la date de copyright')
        QtGui.QApplication.processEvents()
    except Exception:
        # Best-effort UI refresh: presumably there is no progress dialog when
        # running without a GUI (TODO confirm). Unlike a bare ``except``, this
        # still lets KeyboardInterrupt / SystemExit propagate.
        pass
    job.log('Récupération du copyright')
    pubdate = sorted(copyright)[0] + '-01-01'
    # os.path.join instead of a hard-coded backslash so the path is also
    # valid on non-Windows platforms.
    opf_file = os.path.normpath(os.path.join(monepub.chemin, monepub.opf_path))
    parser = etree.XMLParser(attribute_defaults=False, load_dtd=False, remove_comments=False, ns_clean=True)
    tree = etree.parse(opf_file, parser)
    event_attr = "{http://www.idpf.org/2007/opf}event"
    for child in xpath(tree, '//dc:date'):
        # Dates tagged as publication, and dates with no event at all, are
        # both overwritten; dates carrying any other event are left alone.
        if child.get(event_attr) in ('publication', None):
            child.text = pubdate
    texte = etree.tostring(tree, encoding='utf-8', xml_declaration=True, pretty_print=True)
    with open(opf_file, mode='wb') as f:
        f.write(texte)
    opf.parse_opf(opf_file)
    return

예제 #4

0

파일 보기

파일: container.py 프로젝트: vdfebook/PersonnaLiseur

    def init_read(self):
        """Populate this epub object from its unpacked files.

        Reads ``META-INF/container.xml`` (path relative to the current working
        directory) to find the OPF file, parses it into ``self.opf`` and then
        fills in ``uid``, ``item_toc``, ``auteur``, ``titre``, ``css``,
        ``liste_items``, ``liste_fichiers`` and ``toc``. Finally calls
        ``set_cover()`` when ``self.pagedecouv`` is ``None``.
        """
        # Locate the OPF file through the container descriptor.
        root = minidom.parse('META-INF/container.xml').documentElement

        for rootfile in root.getElementsByTagName('rootfile'):
            if rootfile.getAttribute('media-type') != MIMETYPE_OPF:
                continue
            # e.g. OEBPS\content.opf or content.opf
            self.opf_path = rootfile.getAttribute('full-path')
            if os.path.isfile(os.path.join(self.chemin, self.opf_path)):
                self.content_path = os.path.dirname(self.opf_path)  # OEBPS or ''
                break
            # The declared path does not exist: search the epub directory.
            self.opf_path = self.cherche_opf(self.chemin)
            if not self.opf_path:
                job.log('Epub invalide : pas de content.opf')
                break
            self.content_path = os.path.dirname(self.opf_path)  # OEBPS or ''

        # Parse the OPF package document.
        self.opf = opf.parse_opf(self.opf_path)
        try:
            self.uid = [ident for ident in self.opf.metadata.identifiers
                        if ident[1] == self.opf.uid_id][0]
        except:
            # Any failure (including "no matching identifier") means no uid.
            self.uid = None
        self.item_toc = self.get_item(self.opf.spine.toc)

        if self.item_toc is None:
            # Fall back on manifest entries whose href mentions 'ncx';
            # the last such entry wins.
            for identifier in self.opf.manifest:
                entry = self.opf.manifest[identifier]
                if 'ncx' in entry.href:
                    self.item_toc = os.path.join(self.content_path, entry.href)

        metadata = self.opf.metadata
        self.auteur = metadata.creators[0][0] if metadata.creators else ''
        self.titre = metadata.titles[0][0] if metadata.titles else ''

        # Collect the css files (and build the file lists in the same pass).
        self.css = []
        self.liste_items = []
        self.liste_fichiers = []
        for identifier in self.opf.manifest:
            entry = self.opf.manifest[identifier]
            href = entry.href
            self.liste_items.append(href)
            self.liste_fichiers.append(self.normalize(self.relat_to_abs(href)))
            if 'css' in href:
                # e.g. OEBPS\Style\styles.css or stylesheet.css
                self.css.append(os.path.join(self.content_path, href))

        # Parse the NCX table of contents.
        try:
            self.toc = ncx.parse_toc(self.read_item(self.item_toc))
        except (UnicodeError, IOError):
            self.toc = None

        # Look for the cover page and cover image.
        if self.pagedecouv is None:
            self.set_cover()

예제 #5

0

파일 보기

파일: tatoo.py 프로젝트: vdfebook/PersonnaLiseur

def modif_metas_tatoo(monepub):
    """Give the epub a fresh UUID identifier and strip identifying metadata.

    A new ``urn:uuid:`` value is stored in ``monepub.uid`` and written to the
    ``dtb:uid`` meta of the NCX file and to the unique identifier of the OPF
    file. In the OPF file the function also:

    * renames the unique identifier's id to ``BookId`` and sets the
      ``opf:scheme`` of every ``dc:identifier`` to ``uuid``;
    * truncates the text of the identifier with id ``ean`` to 13 characters
      and removes every other non-``BookId`` identifier;
    * removes ``dc:rights`` elements that have text;
    * empties ``dc:description`` elements whose text matches
      ``Version <1-6 digits>``;
    * removes XML comments that have text.

    Both files are rewritten in place and the OPF file is re-parsed with
    ``opf.parse_opf``.

    Args:
        monepub: epub object; ``chemin``, ``content_path``, ``item_toc`` and
            ``opf_path`` are read, ``uid`` is overwritten.

    Returns:
        None.
    """
    # Create a new uuid identifier.
    monepub.uid = 'urn:uuid:' + str(uuid.uuid4())
    parser = etree.XMLParser(attribute_defaults=False, load_dtd=False, remove_comments=False, ns_clean=True)  # , recover=True)

    # --- toc.ncx ---------------------------------------------------------
    toc_item = monepub.item_toc
    if hasattr(toc_item, 'href'):
        toc_rel = os.path.join(monepub.content_path, toc_item.href)
    else:
        # NOTE(review): item_toc can apparently also be a plain path string
        # that already includes content_path -- confirm against init_read.
        toc_rel = toc_item
    # os.path.join instead of hard-coded backslashes keeps this portable.
    mytoc = os.path.normpath(os.path.join(monepub.chemin, toc_rel))
    tree = etree.parse(mytoc, parser)
    # Replace the identifier with the new one.
    for meta in XPath('//dt:meta[@name="dtb:uid"]')(tree):
        meta.set('content', monepub.uid)
    texte = etree.tostring(tree, encoding='utf-8', xml_declaration=True, pretty_print=True)
    with open(mytoc, mode='wb') as f:
        f.write(texte)

    # --- .opf ------------------------------------------------------------
    myopf = os.path.join(monepub.chemin, monepub.opf_path)
    tree = etree.parse(myopf, parser)
    # Fetch the id of the unique identifier.
    package = XPath('//opf:package')(tree)[0]
    unique = str(package.get('unique-identifier'))
    # Replace the identifier with the new one.
    matches = XPath('//dc:identifier[@id="{}"]'.format(unique))(tree)
    if len(matches):
        matches[0].text = monepub.uid
        # Then change its id (the opf:scheme is set for all identifiers below).
        matches[0].set('id', 'BookId')
    package.set('unique-identifier', 'BookId')
    job.logtat('remplacement de l\'identifiant unique par : ' + monepub.uid)
    for child in xpath(tree, '//dc:identifier'):
        child.set("{http://www.idpf.org/2007/opf}scheme", 'uuid')
    # Remove the other identifiers.
    for ident in XPath('//dc:identifier')(tree):
        if ident.get('id') == 'BookId':
            continue
        if ident.get('id') == 'ean':  # or (ident.get('opf:scheme') in ('ISBN', 'isbn')) :
            # An empty <dc:identifier/> has text None; nothing to truncate.
            if ident.text is not None:
                ident.text = ident.text[0:13]
        else:
            # str() so an identifier without text does not raise TypeError.
            job.logtat('suppression de l\'identifiant : "' + str(ident.text) + '"')
            ident.getparent().remove(ident)
    # Remove the dc:rights elements.
    for right in XPath('//dc:rights')(tree):
        if right.text is not None:
            job.logtat('suppression du dc:rights : "' + str(right.text) + '" ')
            right.getparent().remove(right)
    # Blank the descriptions that contain a version code.
    for com in XPath('//dc:description')(tree):
        if re.search('Version ([0-9]{1,6})', str(com.text)):
            job.logtat('suppression de la description : "' + str(com.text) + '"')
            com.text = ''
    # Remove the comments. The iterator is materialised first so that
    # removing nodes cannot interfere with the ongoing tree traversal.
    for elem in list(tree.iter(tag=etree.Comment)):
        if elem.text:
            job.logtat('suppression des commentaires dans le opf : "' + str(elem.text) + '"')
            elem.getparent().remove(elem)
    # Rewrite the file.
    texte = etree.tostring(tree, encoding='utf-8', xml_declaration=True, pretty_print=True)
    with open(myopf, mode='wb') as f:
        f.write(texte)
    opf.parse_opf(myopf)
    return