Python QuickXHTMLParser.parse_iter示例，quickparser.QuickXHTMLParser.parse_iter Python示例

示例#1

0

显示文件

文件： navprocessor.py 项目： trlgoz/Sigil

    def setLandmarks(self, landmarks):
        """Swap the landmarks nav in ``self.content`` for newly built markup.

        The replacement markup is whatever ``self.buildLandmarks(landmarks)``
        returns.  The current content is re-serialized piece by piece; every
        piece from the opening tag of a ``nav`` whose ``epub:type`` attribute
        is ``"landmarks"`` through its closing tag is dropped, and the
        ``SIGIL_REPLACE_LANDMARKS_HERE`` marker is emitted in its place.  The
        first match of ``NAV_LANDMARKS_PATTERN`` in the rebuilt text is then
        replaced by the new markup.

        Returns True once ``self.content`` has been updated, or False (with
        ``self.content`` left untouched) when the pattern does not match.
        """
        replacement = self.buildLandmarks(landmarks)
        source = self.content
        parser = QuickXHTMLParser()
        parser.setContent(source)

        pieces = []
        current_nav = None   # epub:type of the most recently opened nav
        dropping = False     # True while inside the landmarks nav
        for text, _path, tag, kind, attrs in parser.parse_iter():
            # text nodes are copied through unless we are inside the old nav
            if text is not None:
                if not dropping:
                    pieces.append(text)
                continue

            if tag == "nav":
                if kind == "begin":
                    current_nav = attrs.get("epub:type", None)
                    if current_nav == "landmarks":
                        # stand-in for the whole nav element
                        pieces.append(SIGIL_REPLACE_LANDMARKS_HERE)
                        dropping = True
                        continue
                elif kind == "end" and current_nav == "landmarks":
                    current_nav = None
                    dropping = False
                    continue

            if not dropping:
                pieces.append(parser.tag_info_to_xml(tag, kind, attrs))

        rebuilt = "".join(pieces)
        found = re.search(NAV_LANDMARKS_PATTERN, rebuilt)
        if found is None:
            return False
        head = rebuilt[0:found.start()]
        tail = rebuilt[found.end():]
        self.content = head + replacement + tail
        return True

示例#2

0

显示文件

文件： navprocessor.py 项目： trlgoz/Sigil

    def setTOC(self, toclist):
        """Swap the toc nav in ``self.content`` for newly built markup.

        The replacement markup is whatever ``self.buildTOC(toclist)`` returns.
        The current content is re-serialized piece by piece; every piece from
        the opening tag of a ``nav`` whose ``epub:type`` attribute is
        ``"toc"`` through its closing tag is dropped, and the
        ``SIGIL_REPLACE_TOC_HERE`` marker is emitted in its place.  The first
        match of ``NAV_TOC_PATTERN`` in the rebuilt text is then replaced by
        the new markup.

        Returns True once ``self.content`` has been updated, or False (with
        ``self.content`` left untouched) when the pattern does not match.
        """
        replacement = self.buildTOC(toclist)
        source = self.content
        parser = QuickXHTMLParser()
        parser.setContent(source)

        pieces = []
        current_nav = None   # epub:type of the most recently opened nav
        dropping = False     # True while inside the toc nav
        for text, _path, tag, kind, attrs in parser.parse_iter():
            # text nodes are copied through unless we are inside the old nav
            if text is not None:
                if not dropping:
                    pieces.append(text)
                continue

            if tag == "nav":
                if kind == "begin":
                    current_nav = attrs.get("epub:type", None)
                    if current_nav == "toc":
                        # stand-in for the whole nav element
                        pieces.append(SIGIL_REPLACE_TOC_HERE)
                        dropping = True
                        continue
                elif kind == "end" and current_nav == "toc":
                    current_nav = None
                    dropping = False
                    continue

            if not dropping:
                pieces.append(parser.tag_info_to_xml(tag, kind, attrs))

        rebuilt = "".join(pieces)
        found = re.search(NAV_TOC_PATTERN, rebuilt)
        if found is None:
            return False
        head = rebuilt[0:found.start()]
        tail = rebuilt[found.end():]
        self.content = head + replacement + tail
        return True

示例#3

0

显示文件

文件： navprocessor.py 项目： trlgoz/Sigil

    def setPageList(self, pagelist):
        """Replace (or add) the page-list nav in ``self.content``.

        The replacement markup is whatever ``self.buildPageList(pagelist)``
        returns.  The current content is re-serialized piece by piece; every
        piece from the opening tag of a ``nav`` whose ``epub:type`` attribute
        is ``"page-list"`` through its closing tag is dropped, and the
        ``SIGIL_REPLACE_PAGELIST_HERE`` marker is emitted in its place.  When
        no such nav was seen and ``pagelist`` is non-empty, the marker is
        inserted just before the closing ``body`` tag instead.  The first
        match of ``NAV_PAGELIST_PATTERN`` in the rebuilt text is then replaced
        by the new markup.

        Returns True once ``self.content`` has been updated, or False (with
        ``self.content`` left untouched) when the pattern does not match.
        """
        pagelist_xhtml = self.buildPageList(pagelist)
        # replace the pagelist from the navsrc with a placeholder
        navsrc = self.content
        qp = QuickXHTMLParser()
        qp.setContent(navsrc)
        nav_type = None
        res = []
        skip_output = False
        found_page_list = False

        for txt, _tp, tname, ttype, tattr in qp.parse_iter():
            if txt is not None:
                if not skip_output:
                    res.append(txt)
                continue

            if tname == "nav" and ttype == "begin":
                nav_type = tattr.get("epub:type", None)
                if nav_type == "page-list":
                    res.append(SIGIL_REPLACE_PAGELIST_HERE)
                    found_page_list = True
                    skip_output = True
                    continue
            if tname == "nav" and ttype == "end" and nav_type == "page-list":
                nav_type = None
                skip_output = False
                continue
            if tname == "body" and ttype == "end":
                if not found_page_list and len(pagelist) > 0:
                    # Reuse the piece emitted just before </body> as padding
                    # only when it is pure whitespace.  Otherwise it is a tag
                    # or real text, and repeating it after the placeholder
                    # would duplicate content and corrupt the document.
                    padding = ""
                    if res and res[-1].strip() == "":
                        padding = res[-1]
                    res.append(SIGIL_REPLACE_PAGELIST_HERE)
                    res.append(padding)
                    found_page_list = True

            if not skip_output:
                res.append(qp.tag_info_to_xml(tname, ttype, tattr))

        navsrc = "".join(res)
        m = re.search(NAV_PAGELIST_PATTERN, navsrc)
        if m is None:
            return False
        navsrc = navsrc[0:m.start()] + pagelist_xhtml + navsrc[m.end():]
        self.content = navsrc
        return True

示例#4

0

显示文件

    def getTOC(self):
        """Collect the table-of-contents entries from ``self.content``.

        Walks the parsed content and, for every ``a`` element that closes
        inside a ``nav`` whose ``epub:type`` attribute is ``"toc"``, records a
        tuple ``(play_order, level, href, title)``:

        * ``play_order`` is a counter starting at 1,
        * ``level`` is the number of ``ol`` elements currently open inside
          the toc nav,
        * ``href`` is the anchor's ``href`` attribute as found (``""`` when
          the attribute is absent),
        * ``title`` is the anchor's accumulated text passed through
          ``xmldecode``.

        Returns the list of tuples in document order.
        """
        # parse the nav to get the table of contents
        navsrc = self.content
        toclist = []

        qp = QuickXHTMLParser()
        qp.setContent(navsrc)
        lvl = 0
        po = 0
        title = ""
        nav_type = None
        href = None
        for txt, tp, tname, ttype, tattr in qp.parse_iter():
            if txt is not None:
                # only text located somewhere inside an "a" element counts
                if ".a." in tp or tp.endswith(".a"):
                    title = title + txt
                else:
                    title = ""
                continue

            if tname == "nav" and ttype == "begin":
                nav_type = tattr.get("epub:type", None)
                continue
            if tname == "nav" and ttype == "end":
                nav_type = None
                continue
            if nav_type != "toc":
                continue

            if tname == "ol":
                if ttype == "begin":
                    lvl += 1
                if ttype == "end":
                    lvl -= 1
                continue
            if tname == "a" and ttype == "begin":
                href = tattr.get("href", "")
                # must leave all url hrefs in raw url encoded form
                # if they can ever contain fragments

                # Start this entry with an empty title: anchor text seen
                # outside the toc nav is never cleared by the "a" end branch
                # below, so without this it would leak into the first entry
                # when no non-anchor text node comes in between.
                title = ""
                continue
            if tname == "a" and ttype == "end":
                po += 1
                title = xmldecode(title)
                toclist.append((po, lvl, href, title))
                title = ""
                href = None
                continue

        return toclist

示例#5

0

显示文件

    def getLandmarks(self):
        """Collect the landmark entries from ``self.content``.

        Walks the parsed content and, for every ``a`` element that closes
        inside a ``nav`` whose ``epub:type`` attribute is ``"landmarks"`` and
        that itself carries an ``epub:type`` attribute, records a tuple
        ``(epubtype, href, title)``:

        * ``epubtype`` is the anchor's ``epub:type`` attribute,
        * ``href`` is the anchor's ``href`` attribute as found (``""`` when
          the attribute is absent),
        * ``title`` is the anchor's accumulated text passed through
          ``xmldecode``.

        Anchors without an ``epub:type`` attribute are skipped.  Returns the
        list of tuples in document order.
        """
        # parse the nav to get the landmarks
        navsrc = self.content
        landmarks = []

        qp = QuickXHTMLParser()
        qp.setContent(navsrc)
        title = ""
        nav_type = None
        href = None
        epubtype = None
        for txt, tp, tname, ttype, tattr in qp.parse_iter():
            if txt is not None:
                # only text located somewhere inside an "a" element counts
                if ".a." in tp or tp.endswith(".a"):
                    title = title + txt
                else:
                    title = ""
                continue

            if tname == "nav" and ttype == "begin":
                nav_type = tattr.get("epub:type", None)
                continue
            if tname == "nav" and ttype == "end":
                nav_type = None
                continue
            if nav_type != "landmarks":
                continue

            if tname == "a" and ttype == "begin":
                href = tattr.get("href", "")
                # must leave all hrefs in raw url encoded form
                # if they can contain fragments
                epubtype = tattr.get("epub:type", None)

                # Start this entry with an empty title: anchor text seen
                # outside the landmarks nav is never cleared by the "a" end
                # branch below, so without this it would leak into the first
                # entry when no non-anchor text node comes in between.
                title = ""
                continue
            if tname == "a" and ttype == "end":
                if epubtype is not None:
                    title = xmldecode(title)
                    landmarks.append((epubtype, href, title))
                title = ""
                epubtype = None
                href = None
                continue
        return landmarks

示例#6

0

显示文件

文件： navprocessor.py 项目： trlgoz/Sigil

    def getPageList(self):
        """Collect the page-list entries from ``self.content``.

        Walks the parsed content and, for every ``a`` element that closes
        inside a ``nav`` whose ``epub:type`` attribute is ``"page-list"``,
        records a tuple ``(page_count, href, title)``:

        * ``page_count`` is a counter starting at 1,
        * ``href`` is the anchor's ``href`` attribute (``""`` when absent)
          passed through ``unquoteurl``,
        * ``title`` is the anchor's accumulated text passed through
          ``xmldecode``.

        Returns the list of tuples in document order.
        """
        # parse the nav source to get the page-list
        navsrc = self.content
        pagelist = []

        qp = QuickXHTMLParser()
        qp.setContent(navsrc)
        pgcnt = 0
        nav_type = None
        href = None
        title = ""
        for txt, tp, tname, ttype, tattr in qp.parse_iter():
            if txt is not None:
                # only text located somewhere inside an "a" element counts
                if ".a." in tp or tp.endswith(".a"):
                    title = title + txt
                else:
                    title = ""
                continue

            if tname == "nav" and ttype == "begin":
                nav_type = tattr.get("epub:type", None)
                continue
            if tname == "nav" and ttype == "end":
                nav_type = None
                continue
            if nav_type != "page-list":
                continue

            if tname == "a" and ttype == "begin":
                href = tattr.get("href", "")
                # NOTE(review): the TOC and landmarks getters in this file
                # deliberately keep hrefs in raw url-encoded form; confirm
                # that unquoting here is intended before changing it.
                href = unquoteurl(href)

                # Start this entry with an empty title: anchor text seen
                # outside the page-list nav is never cleared by the "a" end
                # branch below, so without this it would leak into the first
                # entry when no non-anchor text node comes in between.
                title = ""
                continue
            if tname == "a" and ttype == "end":
                pgcnt += 1
                title = xmldecode(title)
                pagelist.append((pgcnt, href, title))
                title = ""
                # reset like the TOC and landmarks getters do, so a stale
                # href can never be attached to a later entry
                href = None
                continue

        return pagelist