示例#1
0
def add_landmark(dom: EasyXmlTree, textf: str, landmarks: list) -> None:
    """
    Append a landmark entry for this file to `landmarks`, if the file warrants one.

    The semantic type is taken from the first <section>/<article> directly under
    <body>, falling back to the epub:type of <body> itself. Nothing is appended
    when that type is exactly one of frontmatter/bodymatter/backmatter, or when
    nothing is left once the division token has been stripped from it.

    INPUTS:
    dom: EasyXmlTree representation of the file we are indexing in ToC
    textf: path to the file; stored as the landmark's link
    landmarks: the list of landmark items we are building (appended to in place)

    OUTPUTS:
    None

    RAISES:
    se.InvalidInputException: no top-level <section>/<article>, or no <body>, was found
    """

    top_level_nodes = dom.xpath("//body/*[name() = 'section' or name() = 'article']")
    if not top_level_nodes:
        raise se.InvalidInputException(
            "Couldn’t locate first [xhtml]<section>[/] or [xhtml]<article>[/]."
        )
    semantic = top_level_nodes[0].get_attr("epub:type")

    body_nodes = dom.xpath("//body")
    if not body_nodes:
        raise se.InvalidInputException("Couldn’t locate [xhtml]<body>[/].")
    body_node = body_nodes[0]

    # Some productions carry no epub:type on the outermost section; fall back to
    # the one on <body>, normalising a missing value to the empty string.
    if not semantic:
        semantic = body_node.get_attr("epub:type") or ""

    # A division semantic entirely on its own does not get a landmark.
    if semantic in ("frontmatter", "bodymatter", "backmatter"):
        return

    # Strip a (front|body|back)matter token that is mixed in with other semantics;
    # per the original note on this step, the division is added back to the
    # landmark later (that happens outside this function).
    semantic = regex.sub(r"(front|body|back)matter\s*", "", semantic)

    entry = TocItem()
    if not semantic:
        return

    entry.epub_type = semantic
    entry.file_link = textf
    entry.place = get_place(body_node)

    if semantic == "halftitlepage":
        entry.title = "Half Title"
    else:
        # Prefer the page <title>; a None result means the file has no usable one.
        page_title = dom.xpath("//head/title/text()", True)
        # Last resort: derive a title from the semantic type itself.
        entry.title = entry.epub_type.capitalize() if page_title is None else page_title

    landmarks.append(entry)
def process_headings(dom: EasyXmlTree, textf: str, toc_list: list,
                     nest_under_halftitle: bool, single_file: bool) -> None:
    """
    Find headings in current file and extract title data
    into items added to toc_list.

    A file without any heading element (e.g. a dedication or an epigraph) gets a
    single entry titled from the page <title>. Otherwise one entry is produced
    per <hgroup> or stand-alone heading.

    INPUTS:
    dom: an EasyXmlTree representation of the current file
    textf: the path to the file
    toc_list: the list of ToC items we are building (appended to in place)
    nest_under_halftitle: does this item need to be nested? If so, every entry's level is bumped by one
    single_file: is there only a single content item in the production?

    OUTPUTS:
    None

    RAISES:
    se.InvalidInputException: the file has no <body> element
    """

    body = dom.xpath("//body")
    if not body:
        raise se.InvalidInputException("Couldn't locate body node")
    place = get_place(body[0])

    is_toplevel = True

    # Find all the hgroups and h1, h2 etc headings.
    heads = dom.xpath("//hgroup | //h1 | //h2 | //h3 | //h4 | //h5 | //h6")

    # Special treatment where we can't find any heading or hgroup:
    # may be a dedication or an epigraph, with no heading tag.
    if not heads:
        if single_file and nest_under_halftitle:
            # There's a halftitle, but only this one content file with no subsections,
            # so leave out of ToC because the ToC will link to the halftitle.
            return

        special_item = TocItem()

        # Need to determine level depth. We don't have a heading, so use the first
        # content item. An xpath query with no matches gives an empty result (the
        # same way `heads` and `body` are tested above), so test for emptiness
        # rather than None; indexing an empty result would raise IndexError.
        content_items = dom.xpath("//p | //header | //img")
        if content_items:
            ancestors = content_items[0].xpath(
                "./ancestor::*[name() = 'section' or name() = 'article']")
            # Depth is the number of enclosing sections/articles, but never below 1.
            special_item.level = max(len(ancestors), 1)
        # With no content item at all, the level TocItem() starts with is kept.

        if nest_under_halftitle:
            special_item.level += 1

        # Use the page title as the ToC entry title.
        page_title = dom.xpath("//head/title/text()", True)
        special_item.title = "NO TITLE" if page_title is None else page_title
        special_item.file_link = textf
        toc_list.append(special_item)
        return

    for heading in heads:
        # Don't process a heading separately if it's within a hgroup;
        # the hgroup itself is in `heads` and is handled as one unit.
        if heading.parent.tag == "hgroup":
            continue

        # If it's not a bodymatter item we don't care about whether it's single_file.
        heading_single_file = single_file if place == Position.BODY else False
        toc_item = process_a_heading(heading, textf, is_toplevel,
                                     heading_single_file)

        # Tricky check to see if we want to include the item because there's a halftitle
        # but only a single content file with no subsidiary sections.
        if is_toplevel and single_file and nest_under_halftitle and len(
                heads) == 1:
            continue

        if nest_under_halftitle:
            toc_item.level += 1

        # Only the first entry actually added for this file counts as top level.
        is_toplevel = False
        toc_list.append(toc_item)
def process_headings(dom: EasyXmlTree, textf: str, toc_list: list,
                     nest_under_halftitle: bool, single_file: bool) -> None:
    """
    Find headings in current file and extract title data
    into items added to toc_list.

    A file without any heading element (e.g. a dedication or an epigraph) gets a
    single entry titled from the page <title>. Otherwise one entry is produced
    per <hgroup> or stand-alone heading, with its level taken from get_level().

    INPUTS:
    dom: an EasyXmlTree representation of the current file
    textf: the path to the file
    toc_list: the list of ToC items we are building (appended to in place)
    nest_under_halftitle: does this item need to be nested?
    single_file: is there only a single content item in the production?

    OUTPUTS:
    None

    RAISES:
    se.InvalidInputException: the file has no <body> element
    """

    body = dom.xpath("//body")
    if not body:
        raise se.InvalidInputException("Couldn’t locate [xhtml]<body>[/].")
    place = get_place(body[0])

    is_toplevel = True

    # Find all the hgroups and h1, h2 etc headings.
    heads = dom.xpath("//hgroup | //h1 | //h2 | //h3 | //h4 | //h5 | //h6")

    # Special treatment where we can't find any heading or hgroup:
    # may be a dedication or an epigraph, with no heading tag.
    if not heads:
        if single_file and nest_under_halftitle:
            # There's a halftitle, but only this one content file with no subsections,
            # so leave out of ToC because the ToC will link to the halftitle.
            return

        special_item = TocItem()

        # Need to determine level depth. We don't have a heading, so use the first
        # content item. An xpath query with no matches gives an empty result (the
        # same way `heads` and `body` are tested above), so test for emptiness
        # rather than None; indexing an empty result would raise IndexError.
        content_items = dom.xpath("//p | //header | //img")
        if content_items:
            special_item.level = get_level(content_items[0], toc_list)
        # With no content item at all, the level TocItem() starts with is kept.

        # Use the page title as the ToC entry title.
        page_title = dom.xpath("//head/title/text()", True)
        special_item.title = "NO TITLE" if page_title is None else page_title
        special_item.file_link = textf
        special_item.place = place
        toc_list.append(special_item)
        return

    # Whether this file is the titlepage does not depend on the heading being
    # processed, so query it once instead of once per heading.
    is_titlepage = bool(
        dom.xpath("//section[re:test(@epub:type, '\\btitlepage\\b')]"))

    for heading in heads:
        # Don't process a heading separately if it's within a hgroup;
        # the hgroup itself is in `heads` and is handled as one unit.
        if heading.parent.tag == "hgroup":
            continue

        # If it's not a bodymatter item we don't care about whether it's single_file.
        heading_single_file = single_file if place == Position.BODY else False
        toc_item = process_a_heading(heading, textf, is_toplevel,
                                     heading_single_file)

        toc_item.level = get_level(heading, toc_list)
        toc_item.place = place

        # Exception: the titlepage is always titled "Titlepage" in the ToC.
        if is_titlepage:
            toc_item.title = "Titlepage"

        is_toplevel = False
        toc_list.append(toc_item)
def add_landmark(dom: EasyXmlTree, textf: str, landmarks: list) -> None:
    """
    Append a landmark entry for this file to `landmarks`, if the file warrants one.

    The semantic type is taken from the first <section>/<article>/<nav> directly
    under <body>, falling back to the epub:type of <body> itself. Frontmatter
    files never get a landmark, and backmatter files only do when their type is
    one of loi, endnotes, bibliography, glossary or index.

    INPUTS:
    dom: EasyXmlTree representation of the file we are indexing in ToC
    textf: path to the file; stored as the landmark's link
    landmarks: the list of landmark items we are building (appended to in place)

    OUTPUTS:
    None

    RAISES:
    se.InvalidInputException: no top-level <section>/<article>/<nav>, or no <body>, was found
    """

    # The IDPF a11y best practices page <http://idpf.org/epub/a11y/techniques/#sem-003> says:
    # > it is recommended to include a link to the start of the body matter as well as to any major
    # > reference sections (e.g., table of contents, endnotes, bibliography, glossary, index).
    #
    # Hence the landmarks hold only the start of the text plus (endnotes, glossary, bibliography, loi).

    top_level_nodes = dom.xpath(
        "//body/*[name() = 'section' or name() = 'article' or name() = 'nav']")
    if not top_level_nodes:
        raise se.InvalidInputException(
            "Couldn’t locate first [xhtml]<section>[/], [xhtml]<article>[/], or [xhtml]<nav>[/]."
        )
    semantic = top_level_nodes[0].get_attr("epub:type")

    body_nodes = dom.xpath("//body")
    if not body_nodes:
        raise se.InvalidInputException("Couldn’t locate [xhtml]<body>[/].")
    body_node = body_nodes[0]

    # Some productions carry no epub:type on the outermost section; fall back to
    # the one on <body>, normalising a missing value to the empty string.
    if not semantic:
        semantic = body_node.get_attr("epub:type") or ""

    # A division semantic entirely on its own does not get a landmark.
    if semantic in ("frontmatter", "bodymatter", "backmatter"):
        return

    # We don't want frontmatter in the landmarks.
    if dom.xpath("//*[contains(@epub:type, 'frontmatter')]"):
        return

    # We only want certain backmatter in the landmarks. The regex is consulted
    # only when some element in the file is marked as backmatter.
    backmatter_nodes = dom.xpath("//*[contains(@epub:type, 'backmatter')]")
    if backmatter_nodes and not regex.findall(
            r"\b(loi|endnotes|bibliography|glossary|index)\b", semantic):
        return

    # Strip a (front|body|back)matter token that is mixed in with other semantics;
    # per the original note on this step, the division is added back to the
    # landmark later (that happens outside this function).
    semantic = regex.sub(r"(front|body|back)matter\s*", "", semantic)

    entry = TocItem()
    if not semantic:
        return

    entry.epub_type = semantic
    entry.file_link = textf
    entry.place = get_place(body_node)

    # Pages whose landmark title is fixed rather than read from the file.
    fixed_titles = {
        "halftitlepage": "Half Title",
        "titlepage": "Titlepage",
    }
    fixed_title = fixed_titles.get(semantic)
    if fixed_title is not None:
        entry.title = fixed_title
    else:
        # Prefer the page <title>; a None result means the file has no usable one.
        page_title = dom.xpath("//head/title/text()", True)
        # Last resort: derive a title from the semantic type itself.
        entry.title = entry.epub_type.capitalize() if page_title is None else page_title

    landmarks.append(entry)
def process_headings(dom: EasyXmlTree, textf: str, toc_list: list,
                     single_file: bool,
                     single_file_without_headers: bool) -> None:
    """
    Find headings in current file and extract title data
    into items added to toc_list.

    A file without any heading element (e.g. a dedication or an epigraph) gets a
    single entry titled from the page <title>. Otherwise one entry is produced
    per <hgroup> or stand-alone heading, with its level taken from get_level().

    INPUTS:
    dom: an EasyXmlTree representation of the current file
    textf: the path to the file
    toc_list: the list of ToC items we are building (appended to in place)
    single_file: is there only a single content item in the production?
    single_file_without_headers: when true and this file contains a halftitlepage
        section, its entries are titled "Half-Titlepage"

    OUTPUTS:
    None

    RAISES:
    se.InvalidInputException: the file has no <body> element
    """

    body = dom.xpath("//body")
    if not body:
        raise se.InvalidInputException("Couldn’t locate [xhtml]<body>[/].")
    place = get_place(body[0])

    is_toplevel = True

    # Find all the hgroups and h1, h2 etc headings.
    heads = dom.xpath("//hgroup | //h1 | //h2 | //h3 | //h4 | //h5 | //h6")

    # Special treatment where we can't find any heading or hgroup:
    # may be a dedication or an epigraph, with no heading tag.
    if not heads:
        special_item = TocItem()

        # Need to determine level depth. We don't have a heading, so use the first
        # content item. An xpath query with no matches gives an empty result (the
        # same way `heads` and `body` are tested above), so test for emptiness
        # rather than None; indexing an empty result would raise IndexError.
        content_items = dom.xpath("//p | //header | //img")
        if content_items:
            special_item.level = get_level(content_items[0], toc_list)
        # With no content item at all, the level TocItem() starts with is kept.

        # Use the page title as the ToC entry title.
        page_title = dom.xpath("//head/title/text()", True)
        special_item.title = "NO TITLE" if page_title is None else page_title
        special_item.file_link = textf
        special_item.place = place
        toc_list.append(special_item)
        return

    # Neither page-type check depends on the heading being processed, so query
    # each once instead of once per heading.
    is_titlepage = bool(
        dom.xpath("//section[re:test(@epub:type, '\\btitlepage\\b')]"))

    # Exception: If there is only a single body item WITHOUT HEADERS (like Father Goriot or The Path to Rome),
    # the half title page is listed as "Half-Titlepage" instead of the work title,
    # so that we don't duplicate the work title in the ToC. We always include a link to the work body
    # in the ToC because readers on the web version need to have access to the text starting point, since
    # there are no back/forward nav buttons in XHTML files served on the web.
    use_halftitle_label = bool(
        single_file_without_headers and dom.xpath(
            "//section[re:test(@epub:type, '\\bhalftitlepage\\b')]"))

    for heading in heads:
        # Don't process a heading separately if it's within a hgroup;
        # the hgroup itself is in `heads` and is handled as one unit.
        if heading.parent.tag == "hgroup":
            continue

        # If it's not a bodymatter item we don't care about whether it's single_file.
        heading_single_file = single_file if place == Position.BODY else False
        toc_item = process_a_heading(heading, textf, is_toplevel,
                                     heading_single_file)

        toc_item.level = get_level(heading, toc_list)
        toc_item.place = place

        # Exception: the titlepage is always titled "Titlepage" in the ToC.
        if is_titlepage:
            toc_item.title = "Titlepage"

        # Applied after the titlepage override, so it wins if both were to match.
        if use_halftitle_label:
            toc_item.title = "Half-Titlepage"

        is_toplevel = False
        toc_list.append(toc_item)