Python QuickXHTMLParser 예제들, quickparser.QuickXHTMLParser Python 예제들

예제 #1

0

파일 보기

파일: inputcontainer.py 프로젝트: zjj00520/Sigil

 def __init__(self, wrapper, debug=False):
     """Set up the container around ``wrapper``.

     Stores the debug flag and the wrapper, then creates the helper
     objects this container keeps: an XHTML parser, a Hunspell checker
     built from the wrapper's hunspell path, the wrapper's dictionary
     directories, and a JSON preferences store keyed by the plugin
     directory and plugin name.
     """
     self._debug, self._w = debug, wrapper
     self.qp = QuickXHTMLParser()
     hunspell_path = wrapper.get_hunspell_path()
     self.hspell = HunspellChecker(hunspell_path)
     self.dictionary_dirs = wrapper.get_dictionary_dirs()
     plugin_dir = wrapper.plugin_dir
     plugin_name = wrapper.plugin_name
     self._prefs_store = JSONPrefs(plugin_dir, plugin_name)

예제 #2

0

파일 보기

파일: navgenerator.py 프로젝트: robyscar/sigil

def generateNav(ebook_root, navtitle):
    """Build nav document data from the book's OPF guide and its NCX file.

    Reads ``OEBPS/content.opf`` below ``ebook_root``, resolves the NCX path
    the OPF refers to, and combines the NCX contents with those guide
    references whose target file is listed in the spine.

    Args:
        ebook_root: Directory that contains the ``OEBPS`` folder.
        navtitle: Title value handed through to ``build_nav``.

    Returns:
        The value returned by ``build_nav``, or ``""`` when the OPF cannot be
        read, the OPF yields no NCX path, the NCX file does not exist, or
        parsing the NCX / building the nav raises an exception.
    """
    opf_path = os.path.join(ebook_root, 'OEBPS', 'content.opf')
    try:
        with open(opf_path, 'rb') as f:
            opfdata = f.read().decode('utf-8', errors='replace')
    except Exception:
        # Best effort: an unreadable OPF means there is nothing to build.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return ""

    # now parse opf
    opfparser = OPFParser(opfdata)
    guide_info = opfparser.get_guide()
    lang = opfparser.get_lang()
    id2href = opfparser.get_id2hrefmap()
    ncxpath = opfparser.get_ncxpath()
    if ncxpath is None:
        return ""
    ncxpath = os.path.abspath(os.path.join(ebook_root, "OEBPS", ncxpath))
    if not os.path.exists(ncxpath):
        return ""

    # It is possible that the original <guide> contains references
    # to files not in the spine. Putting those "dangling" references
    # in the EPUB3 navigation document will result in validation error:
    # RSC-011 "Found a reference to a resource that is not a spine item.".
    # We must check that the referenced files are listed in the spine.

    # First collect the hrefs in the spine. A spine idref that has no entry
    # in the id -> href map is skipped instead of raising KeyError, so a
    # dangling itemref does not abort nav generation.
    spine_hrefs = {
        id2href[idref]
        for idref, _attr in opfparser.get_spine()
        if idref in id2href
    }

    # reduce guide to those references whose file (fragment removed) is in
    # the spine
    guide_info_in_spine = [
        (gtyp, gtitle, ghref)
        for gtyp, gtitle, ghref in guide_info
        if ghref.split('#')[0] in spine_hrefs
    ]

    # need to take info from guide tag in opf and toc.ncx to create a valid nav.xhtml
    try:
        qp = QuickXHTMLParser()
        doctitle, toclist, pagelist = parse_ncx(qp, ncxpath)
        return build_nav(doctitle, toclist, pagelist, guide_info_in_spine,
                         lang, navtitle)
    except Exception:
        # Any parse/build failure maps to the empty-string failure result.
        return ""

예제 #3

0

파일 보기

    def getTOC(self):
        """Extract the table of contents from the nav source in ``self.content``.

        Walks the parser events and, inside the ``<nav>`` whose ``epub:type``
        is ``"toc"``, records one entry per closed ``<a>`` element.

        Returns:
            A list of ``(order, level, href, title)`` tuples, where ``order``
            is a 1-based running count of anchors, ``level`` is the current
            ``<ol>`` nesting depth, ``href`` is the anchor's raw ``href``
            attribute and ``title`` is the XML-decoded anchor text.
        """
        source = self.content
        entries = []

        parser = QuickXHTMLParser()
        parser.setContent(source)
        depth = 0
        order = 0
        label = ""
        current_nav = None
        target = None
        for text, tagpath, name, kind, attrs in parser.parse_iter():
            if text is not None:
                # Text event: keep collecting only while inside an anchor,
                # otherwise discard whatever was collected so far.
                if ".a." in tagpath or tagpath.endswith(".a"):
                    label = label + text
                else:
                    label = ""
                continue

            # Tag event: track which <nav> we are currently inside.
            if name == "nav":
                if kind == "begin":
                    current_nav = attrs.get("epub:type", None)
                    continue
                if kind == "end":
                    current_nav = None
                    continue

            if current_nav != "toc":
                continue

            if name == "ol":
                # Nesting depth of ordered lists gives the entry level.
                if kind == "begin":
                    depth += 1
                elif kind == "end":
                    depth -= 1
                continue

            if name != "a":
                continue

            if kind == "begin":
                # must leave all url hrefs in raw url encoded form
                # if they can ever contain fragments
                target = attrs.get("href", "")
            elif kind == "end":
                order += 1
                label = xmldecode(label)
                entries.append((order, depth, target, label))
                label = ""
                target = None

        return entries

예제 #4

0

파일 보기

def generateGuideEntries(navdata, navbkpath, opfdir):
     """Return the landmarks parsed from nav data, for use as guide entries.

     Args:
         navdata: Nav document source handed to ``parse_nav``.
         navbkpath: Passed to ``parse_nav`` together with ``navdata``.
         opfdir: Passed to ``parse_nav`` as its last argument.

     Returns:
         The ``landmarks`` element of the 5-tuple returned by ``parse_nav``,
         or ``[("", "", "")]`` when parsing raises an exception.
     """
     try:
         qp = QuickXHTMLParser()
         _toclist, _pagelist, landmarks, _maxlvl, _pgcnt = parse_nav(
             qp, navdata, navbkpath, opfdir)
     except Exception:
         # Best-effort fallback; KeyboardInterrupt/SystemExit still propagate
         # because only Exception subclasses are caught.
         return [("", "", "")]
     return landmarks

예제 #5

0

파일 보기

    def getLandmarks(self):
        """Extract the landmarks from the nav source in ``self.content``.

        Only anchors inside the ``<nav>`` whose ``epub:type`` is
        ``"landmarks"`` are considered, and an anchor is recorded only when
        it carries its own ``epub:type`` attribute.

        Returns:
            A list of ``(epubtype, href, title)`` tuples; ``href`` is the raw
            attribute value and ``title`` is the XML-decoded anchor text.
        """
        source = self.content
        found = []

        parser = QuickXHTMLParser()
        parser.setContent(source)
        label = ""
        current_nav = None
        target = None
        anchor_type = None
        for text, tagpath, name, kind, attrs in parser.parse_iter():
            if text is not None:
                # Collect text only while inside an anchor; any other text
                # throws away what has been collected.
                if ".a." in tagpath or tagpath.endswith(".a"):
                    label = label + text
                else:
                    label = ""
                continue

            # Track the type of the <nav> element we are inside.
            if name == "nav":
                if kind == "begin":
                    current_nav = attrs.get("epub:type", None)
                    continue
                if kind == "end":
                    current_nav = None
                    continue

            if current_nav != "landmarks" or name != "a":
                continue

            if kind == "begin":
                # must leave all hrefs in raw url encoded form
                # if they can contain fragments
                target = attrs.get("href", "")
                anchor_type = attrs.get("epub:type", None)
            elif kind == "end":
                if anchor_type is not None:
                    label = xmldecode(label)
                    found.append((anchor_type, target, label))
                # Reset per-anchor state whether or not it was recorded.
                label = ""
                anchor_type = None
                target = None
        return found

예제 #6

0

파일 보기

파일: navprocessor.py 프로젝트: trlgoz/Sigil

    def setLandmarks(self, landmarks):
        """Replace the landmarks nav in ``self.content`` with a rebuilt one.

        The current nav source is re-serialised with the whole
        ``<nav epub:type="landmarks">`` element swapped for a placeholder;
        the placeholder is then located with ``NAV_LANDMARKS_PATTERN`` and
        replaced by the markup from ``self.buildLandmarks(landmarks)``.

        Returns:
            ``True`` when ``self.content`` was updated, ``False`` when the
            placeholder pattern was not found (content is left untouched).
        """
        replacement = self.buildLandmarks(landmarks)
        source = self.content
        parser = QuickXHTMLParser()
        parser.setContent(source)
        nav_kind = None
        pieces = []
        suppress = False
        for text, tagpath, name, kind, attrs in parser.parse_iter():
            if text is not None:
                if not suppress:
                    pieces.append(text)
                continue

            if name == "nav":
                if kind == "begin":
                    nav_kind = attrs.get("epub:type", None)
                    if nav_kind == "landmarks":
                        # Emit the placeholder once and drop everything up
                        # to the matching end tag.
                        pieces.append(SIGIL_REPLACE_LANDMARKS_HERE)
                        suppress = True
                        continue
                if kind == "end" and nav_kind == "landmarks":
                    nav_kind = None
                    suppress = False
                    continue

            if not suppress:
                pieces.append(parser.tag_info_to_xml(name, kind, attrs))

        rebuilt = "".join(pieces)
        match = re.search(NAV_LANDMARKS_PATTERN, rebuilt)
        if match is None:
            return False
        self.content = rebuilt[:match.start()] + replacement + rebuilt[match.end():]
        return True

예제 #7

0

파일 보기

파일: navprocessor.py 프로젝트: trlgoz/Sigil

    def setTOC(self, toclist):
        """Replace the toc nav in ``self.content`` with a rebuilt one.

        The nav source is re-serialised with the ``<nav epub:type="toc">``
        element swapped for a placeholder, which is then found with
        ``NAV_TOC_PATTERN`` and replaced by ``self.buildTOC(toclist)``.

        Returns:
            ``True`` when ``self.content`` was updated, ``False`` when the
            placeholder pattern was not found (content is left untouched).
        """
        new_toc = self.buildTOC(toclist)
        original = self.content
        parser = QuickXHTMLParser()
        parser.setContent(original)
        inside = None
        output = []
        dropping = False
        for text, tagpath, name, kind, attrs in parser.parse_iter():
            if text is not None:
                # Text is copied through unless we are inside the old toc.
                if not dropping:
                    output.append(text)
                continue

            if name == "nav":
                if kind == "begin":
                    inside = attrs.get("epub:type", None)
                    if inside == "toc":
                        output.append(SIGIL_REPLACE_TOC_HERE)
                        dropping = True
                        continue
                if kind == "end" and inside == "toc":
                    inside = None
                    dropping = False
                    continue

            if not dropping:
                output.append(parser.tag_info_to_xml(name, kind, attrs))

        serialised = "".join(output)
        hit = re.search(NAV_TOC_PATTERN, serialised)
        if hit is None:
            return False
        self.content = serialised[:hit.start()] + new_toc + serialised[hit.end():]
        return True

예제 #8

0

파일 보기

파일: ncxgenerator.py 프로젝트: yiqideren/Sigil

def generateNCX(navdata, navname, doctitle, mainid):
    """Build NCX data from nav document data.

    Args:
        navdata: Nav document source handed to ``parse_nav``.
        navname: Passed to ``parse_nav`` together with ``navdata``.
        doctitle: Document title handed to ``build_ncx``.
        mainid: Book identifier; a leading ``urn:uuid:`` prefix is removed
            before the value is handed to ``build_ncx``.

    Returns:
        The value returned by ``build_ncx``.

    Exceptions raised by ``parse_nav`` or ``build_ncx`` are not caught here
    and propagate to the caller.
    """
    uuid_prefix = "urn:uuid:"
    if mainid.startswith(uuid_prefix):
        mainid = mainid[len(uuid_prefix):]
    qp = QuickXHTMLParser()
    # landmarks is returned by parse_nav but not needed for the NCX.
    toclist, pagelist, _landmarks, maxlvl, pgcnt = parse_nav(
        qp, navdata, navname)
    return build_ncx(doctitle, mainid, maxlvl, pgcnt, toclist, pagelist)

예제 #9

0

파일 보기

def generateNCX(navdata, navbkpath, ncxdir, doctitle, mainid):
    """Build NCX data from nav document data.

    Args:
        navdata: Nav document source handed to ``parse_nav``.
        navbkpath: Passed to ``parse_nav`` together with ``navdata``.
        ncxdir: Passed to ``parse_nav`` as its last argument.
        doctitle: Document title handed to ``build_ncx``.
        mainid: Book identifier, handed to ``build_ncx`` unchanged because
            it must exactly match the one used in the opf.

    Returns:
        The value returned by ``build_ncx``.

    Exceptions raised by ``parse_nav`` or ``build_ncx`` are not caught here
    and propagate to the caller.
    """
    qp = QuickXHTMLParser()
    # landmarks is returned by parse_nav but not needed for the NCX.
    toclist, pagelist, _landmarks, maxlvl, pgcnt = parse_nav(
        qp, navdata, navbkpath, ncxdir)
    return build_ncx(doctitle, mainid, maxlvl, pgcnt, toclist, pagelist)

예제 #10

0

파일 보기

파일: navprocessor.py 프로젝트: trlgoz/Sigil

    def getPageList(self):
        """Extract the page list from the nav source in ``self.content``.

        Only anchors inside the ``<nav>`` whose ``epub:type`` is
        ``"page-list"`` are recorded.

        Returns:
            A list of ``(count, href, title)`` tuples, where ``count`` is a
            1-based running number, ``href`` is the anchor's ``href``
            attribute after ``unquoteurl`` and ``title`` is the XML-decoded
            anchor text.
        """
        navsrc = self.content
        pagelist = []

        qp = QuickXHTMLParser()
        qp.setContent(navsrc)
        pgcnt = 0
        nav_type = None
        href = None
        title = ""
        for txt, tp, tname, ttype, tattr in qp.parse_iter():
            if txt is not None:
                # Collect text only while inside an anchor; any other text
                # throws away what has been collected.
                if ".a." in tp or tp.endswith(".a"):
                    title = title + txt
                else:
                    title = ""
                continue

            if tname == "nav" and ttype == "begin":
                nav_type = tattr.get("epub:type", None)
                continue
            if tname == "nav" and ttype == "end":
                nav_type = None
                continue

            if nav_type != "page-list" or tname != "a":
                continue

            if ttype == "begin":
                href = unquoteurl(tattr.get("href", ""))
            elif ttype == "end":
                pgcnt += 1
                pagelist.append((pgcnt, href, xmldecode(title)))
                title = ""
                # Reset so a stray end tag can never reuse the previous
                # anchor's target.
                href = None

        return pagelist

예제 #11

0

파일 보기

파일: navgenerator.py 프로젝트: PippaCarron/Sigil

def generateNav(opfdata, ncxdata, navtitle):
    """Build nav document data from OPF data and NCX data.

    Combines the contents parsed from ``ncxdata`` with those OPF guide
    references whose target file is listed in the spine.

    Args:
        opfdata: OPF source handed to ``OPFParser``.
        ncxdata: NCX input handed to ``parse_ncx``.
        navtitle: Title value handed through to ``build_nav``.

    Returns:
        The value returned by ``build_nav``, or ``""`` when parsing the NCX
        or building the nav raises an exception.
    """
    # now parse opf
    opfparser = OPFParser(opfdata)
    guide_info = opfparser.get_guide()
    lang = opfparser.get_lang()
    id2href = opfparser.get_id2hrefmap()

    # It is possible that the original <guide> contains references
    # to files not in the spine. Putting those "dangling" references
    # in the EPUB3 navigation document will result in validation error:
    # RSC-011 "Found a reference to a resource that is not a spine item.".
    # We must check that the referenced files are listed in the spine.

    # First collect the hrefs in the spine. A spine idref that has no entry
    # in the id -> href map is skipped instead of raising KeyError, so a
    # dangling itemref does not abort nav generation.
    spine_hrefs = {
        id2href[idref]
        for idref, _attr in opfparser.get_spine()
        if idref in id2href
    }

    # reduce guide to those references whose file (fragment removed) is in
    # the spine
    guide_info_in_spine = [
        (gtyp, gtitle, ghref)
        for gtyp, gtitle, ghref in guide_info
        if ghref.split('#')[0] in spine_hrefs
    ]

    # need to take info from guide tag in opf and toc.ncx to create a valid nav.xhtml
    try:
        qp = QuickXHTMLParser()
        doctitle, toclist, pagelist = parse_ncx(qp, ncxdata)
        return build_nav(doctitle, toclist, pagelist, guide_info_in_spine,
                         lang, navtitle)
    except Exception:
        # Best-effort fallback; KeyboardInterrupt/SystemExit still propagate
        # because only Exception subclasses are caught.
        return ""

예제 #12

0

파일 보기

파일: navprocessor.py 프로젝트: trlgoz/Sigil

    def setPageList(self, pagelist):
        """Replace (or insert) the page-list nav in ``self.content``.

        The nav source is re-serialised with the
        ``<nav epub:type="page-list">`` element swapped for a placeholder.
        When no such element exists and ``pagelist`` is non-empty, the
        placeholder is inserted just before ``</body>``. The placeholder is
        then found with ``NAV_PAGELIST_PATTERN`` and replaced by the markup
        from ``self.buildPageList(pagelist)``.

        Returns:
            ``True`` when ``self.content`` was updated, ``False`` when the
            placeholder pattern was not found (content is left untouched).
        """
        pagelist_xhtml = self.buildPageList(pagelist)
        # replace the pagelist from the navsrc with a placeholder
        navsrc = self.content
        qp = QuickXHTMLParser()
        qp.setContent(navsrc)
        nav_type = None
        res = []
        skip_output = False
        found_page_list = False

        for txt, tp, tname, ttype, tattr in qp.parse_iter():
            if txt is not None:
                if not skip_output:
                    res.append(txt)
                continue

            if tname == "nav" and ttype == "begin":
                nav_type = tattr.get("epub:type", None)
                if nav_type == "page-list":
                    res.append(SIGIL_REPLACE_PAGELIST_HERE)
                    found_page_list = True
                    skip_output = True
                    continue
            if tname == "nav" and ttype == "end" and nav_type == "page-list":
                nav_type = None
                skip_output = False
                continue
            if tname == "body" and ttype == "end":
                if not found_page_list and len(pagelist) > 0:
                    # Re-use the whitespace that precedes </body> as padding
                    # after the inserted block. Only do so when the previous
                    # output item really is whitespace: repeating a tag or
                    # real text would duplicate it in the document, and an
                    # empty ``res`` would raise IndexError.
                    padding = ""
                    if res and not res[-1].strip():
                        padding = res[-1]
                    res.append(SIGIL_REPLACE_PAGELIST_HERE)
                    res.append(padding)
                    found_page_list = True

            if not skip_output:
                res.append(qp.tag_info_to_xml(tname, ttype, tattr))

        navsrc = "".join(res)
        m = re.search(NAV_PAGELIST_PATTERN, navsrc)
        if m is None:
            return False
        self.content = navsrc[:m.start()] + pagelist_xhtml + navsrc[m.end():]
        return True

예제 #13

0

파일 보기

파일: inputcontainer.py 프로젝트: sakishum/Sigil

 def __init__(self, wrapper, debug=False):
     """Keep the debug flag and the wrapper, and create an XHTML parser."""
     self._debug, self._w = debug, wrapper
     self.qp = QuickXHTMLParser()

예제 #14

0

파일 보기

 def __init__(self, wrapper, debug=False):
     """Set up the container around ``wrapper``.

     Stores the debug flag and the wrapper, creates an XHTML parser, and
     opens a JSON preferences store keyed by the wrapper's plugin
     directory and plugin name.
     """
     self._debug, self._w = debug, wrapper
     self.qp = QuickXHTMLParser()
     plugin_dir = wrapper.plugin_dir
     plugin_name = wrapper.plugin_name
     self._prefs_store = JSONPrefs(plugin_dir, plugin_name)