示例#1
0
def output_toc(item_list: list, landmark_list, toc_path: str, work_type: str,
               work_title: str) -> str:
    """
	Build the new ToC from the collected item and landmark lists, splice it
	into the existing ToC document, and return the formatted xhtml string.
	"""

    if len(item_list) < 2:
        raise se.InvalidInputException("Too few ToC items found.")

    existing_toc: BeautifulSoup = get_existing_toc(toc_path)
    if existing_toc is None:
        raise se.InvalidInputException("Existing ToC not found.")

    # The existing ToC is expected to hold exactly two <nav> sections:
    # the first for ToC items, the second for landmarks.
    navs = existing_toc.find_all("nav")

    if len(navs) < 2:
        raise se.InvalidInputException(
            "Existing ToC has too few nav sections.")

    toc_ol = navs[0].find("ol")
    landmarks_ol = navs[1].find("ol")

    # Empty both lists, then re-populate them with freshly generated markup.
    toc_ol.clear()
    landmarks_ol.clear()
    toc_ol.append(BeautifulSoup(process_items(item_list), "html.parser"))
    landmarks_ol.append(
        BeautifulSoup(
            process_landmarks(landmark_list, work_type, work_title),
            "html.parser"))

    return format_xhtml(str(existing_toc))
def output_toc(item_list: list, landmark_list, toc_path: str, work_type: str,
               work_title: str) -> str:
    """
	Outputs the constructed ToC based on the lists of items and landmarks found.

	INPUTS:
	item_list: list of ToC items (the first part of the ToC)
	landmark_list: list of landmark items (the second part of the ToC)
	toc_path: path to the existing ToC file used as the template
	work_type: "fiction" or "non-fiction"
	work_title: the title of the book

	OUTPUTS:
	an xhtml string representing the new ToC
	"""

    if len(item_list) < 2:
        raise se.InvalidInputException("Too few ToC items found.")

    try:
        with open(toc_path, encoding="utf8") as file:
            toc_dom = se.easy_xml.EasyXhtmlTree(file.read())
    except Exception as ex:
        # Chain the original exception so the root cause isn't lost.
        raise se.InvalidInputException(
            f"Existing ToC not found. Exception: {ex}") from ex

    # There should be exactly two nav sections.
    navs = toc_dom.xpath("//nav")

    if len(navs) < 2:
        raise se.InvalidInputException(
            "Existing ToC has too few nav sections.")

    # now remove and then re-add the ol sections to clear them
    for nav in navs:
        ols = nav.xpath("./ol")  # just want the immediate ol children
        for ol_item in ols:
            ol_item.remove()

    # Insert placeholder <ol> elements whose text is swapped for the real
    # markup after serialization; building the markup directly in the tree
    # is awkward with EasyXml, so a string replace is used instead.
    item_ol = EasyXmlElement(etree.Element("ol"), toc_dom.namespaces)
    item_ol.lxml_element.text = "TOC_ITEMS"
    navs[0].append(item_ol)
    landmark_ol = EasyXmlElement(etree.Element("ol"), toc_dom.namespaces)
    landmark_ol.lxml_element.text = "LANDMARK_ITEMS"
    navs[1].append(landmark_ol)
    xhtml = toc_dom.to_string()
    xhtml = xhtml.replace("TOC_ITEMS", process_items(item_list))
    xhtml = xhtml.replace(
        "LANDMARK_ITEMS",
        process_landmarks(landmark_list, work_type, work_title))

    return se.formatting.format_xhtml(xhtml)
示例#3
0
def add_landmark(dom: EasyXmlTree, textf: str, landmarks: list) -> None:
    """
	Adds an item to landmark list with appropriate details.

	INPUTS:
	dom: EasyXmlTree representation of the file we are indexing in ToC
	textf: path to the file
	landmarks: the list of landmark items we are building

	OUTPUTS:
	None
	"""

    sections = dom.xpath("//body/*[name() = 'section' or name() = 'article']")
    if not sections:
        raise se.InvalidInputException(
            "Couldn’t locate first [xhtml]<section>[/] or [xhtml]<article>[/]."
        )
    epub_type = sections[0].get_attr("epub:type")
    bodys = dom.xpath("//body")
    if not bodys:
        raise se.InvalidInputException("Couldn’t locate [xhtml]<body>[/].")

    if not epub_type:  # some productions don't have an epub:type in outermost section, so get it from body tag
        epub_type = bodys[0].get_attr("epub:type")
        if not epub_type:
            epub_type = ""

    if epub_type in ["frontmatter", "bodymatter", "backmatter"]:
        return  # if epub_type is ONLY frontmatter, bodymatter, backmatter, we don't want this as a landmark

    # We may wind up with a (front|body|back)matter semantic in epub_type, remove it here since we add it to the landmark later
    epub_type = regex.sub(r"(front|body|back)matter\s*", "", epub_type)

    landmark = TocItem()
    if epub_type:
        landmark.epub_type = epub_type
        landmark.file_link = textf
        landmark.place = get_place(bodys[0])
        if epub_type == "halftitlepage":
            landmark.title = "Half Title"
        else:
            landmark.title = dom.xpath(
                "//head/title/text()",
                True)  # Use the page title as the landmark entry title.
            if landmark.title is None:
                # This is a bit desperate, use this only if there's no proper <title> tag in file.
                landmark.title = landmark.epub_type.capitalize()
        landmarks.append(landmark)
示例#4
0
	def endnotes(self) -> list:
		"""
		Accessor

		Lazily parse the endnotes file for this ebook into Endnote objects,
		caching the result on the instance.

		INPUTS
		None

		OUTPUTS
		A list of Endnote objects representing the endnotes file for this ebook.
		"""

		if not self._endnotes:
			self._endnotes = []
			dom = self.get_dom(self.endnotes_path)

			endnote_nodes = dom.xpath("/html/body/section[contains(@epub:type, 'endnotes')]/ol/li[contains(@epub:type, 'endnote')]")
			for endnote_node in endnote_nodes:
				note = Endnote()
				note.node = endnote_node
				anchor = endnote_node.get_attr("id")
				try:
					# The note number is embedded in the id, e.g. `note-12`.
					note.number = int(anchor.replace("note-", ""))
				except ValueError:
					note.number = 0
				note.contents = endnote_node.xpath("./*")
				note.anchor = anchor or ""

				for back_link in endnote_node.xpath(".//a[contains(@epub:type, 'backlink')]/@href"):
					note.back_link = back_link
				if not note.back_link:
					raise se.InvalidInputException(f"No backlink found in note {note.anchor} in existing endnotes file.")
				self._endnotes.append(note)

		return self._endnotes
    def toc_link(self) -> str:
        """
		Generates the hyperlink for the ToC item.

		INPUTS:
		None

		OUTPUTS:
		the linking tag line eg <a href=... depending on the data found.
		"""

        out_string = ""
        if not self.title:
            raise se.InvalidInputException(
                f"Couldn’t find title in: [path][link=file://{self.file_link}]{self.file_link}[/][/]."
            )

        if self.subtitle and self.lang:
            # test for a foreign language subtitle, and adjust accordingly
            self.subtitle = f"<span xml:lang=\"{self.lang}\">{self.subtitle}</span>"

        # If the title is entirely Roman numeral, put epub:type within <a>.
        if regex.search(r"^<span epub:type=\"z3998:roman\">[IVXLC]+<\/span>$",
                        self.title):
            # title is a pure roman number
            if self.subtitle == "":  # put the roman flag inside the <a> tag
                out_string += f"<a href=\"text/{self.file_link}\" epub:type=\"z3998:roman\">{self.roman}</a>\n"
            else:
                out_string += f"<a href=\"text/{self.file_link}\"><span epub:type=\"z3998:roman\">{self.roman}</span>: {self.subtitle}</a>\n"
        else:
            # title has text other than a roman numeral
            if self.subtitle != "" and (self.hidden or self.title_is_ordinal or
                                        (self.division in [
                                            BookDivision.PART,
                                            BookDivision.DIVISION,
                                            BookDivision.VOLUME
                                        ])):
                # Use the subtitle only if we're a Part or Division or Volume or if title was an ordinal
                out_string += f"<a href=\"text/{self.file_link}\">{self.title}"

                # Don't append a colon if the ordinal already ends in punctuation, for example  `1.` or `(a)`
                if not regex.search(r"\p{Punctuation}$", self.title):
                    out_string += ":"

                out_string += f" {self.subtitle}</a>\n"
            else:
                # test for a foreign language title, and adjust accordingly
                if self.lang:
                    out_string += f"<a href=\"text/{self.file_link}\" xml:lang=\"{self.lang}\">{self.title}</a>\n"
                else:
                    out_string += f"<a href=\"text/{self.file_link}\">{self.title}</a>\n"

        # Replace <br/> with a single space
        out_string = regex.sub(r"<br/>\s*",
                               " ",
                               out_string,
                               flags=regex.DOTALL)

        return out_string
def process_all_content(file_list: list, text_path: str) -> Tuple[list, list]:
    """
	Analyze the whole content of the project, building and returning lists
	of landmarks and toc_items.

	INPUTS:
	file_list: a list of all content files
	text_path: the path to the contents folder (src/epub/text)

	OUTPUTS:
	a tuple containing the list of landmark items and the list of ToC items
	"""

    toc_list: List[TocItem] = []
    landmarks: List[TocItem] = []

    # We make two passes through the work, because we need to know
    # how many bodymatter items there are. So we do landmarks first.
    for textf in file_list:
        file_path = Path(text_path) / textf
        try:
            with open(file_path, encoding="utf8") as file:
                dom = se.easy_xml.EasyXhtmlTree(file.read())
        except Exception as ex:
            raise se.InvalidFileException(
                f"Couldn’t open file: [path][link=file://{file_path}]{file_path}[/][/]. Exception: {ex}"
            ) from ex

        add_landmark(dom, textf, landmarks)

    # Now we test to see if there is only one body item
    body_items = [item for item in landmarks if item.place == Position.BODY]
    single_file = (len(body_items) == 1)

    nest_under_halftitle = False
    place = Position.NONE
    for textf in file_list:
        with open(Path(text_path) / textf, "r", encoding="utf-8") as file:
            dom = se.easy_xml.EasyXhtmlTree(file.read())
        body = dom.xpath("//body")
        if body:
            place = get_place(body[0])
        else:
            raise se.InvalidInputException("Couldn't locate body node")
        if place == Position.BACK:
            nest_under_halftitle = False
        process_headings(dom, textf, toc_list, nest_under_halftitle,
                         single_file)
        # Everything after the half title nests under it, until backmatter.
        if textf == "halftitle.xhtml":
            nest_under_halftitle = True

    # We add this dummy item because output_toc always needs to look ahead to the next item.
    last_toc = TocItem()
    last_toc.level = 1
    last_toc.title = "dummy"
    toc_list.append(last_toc)

    return landmarks, toc_list
示例#7
0
def process_heading(heading: BeautifulSoup, textf: str, is_toplevel: bool,
                    single_file: bool) -> TocItem:
    """
	Generate and return a TocItem from this heading.

	INPUTS:
	heading: a BeautifulSoup tag representing a heading tag
	textf: the path to the file
	is_toplevel: is this heading at the top-most level in the file?
	single_file: is there only one content file in the production (like some Poetry volumes)?

	OUTPUTS:
	a qualified TocItem object
	"""

    toc_item = TocItem()

    # The nesting depth of the enclosing section/article tags gives the ToC level.
    parent_sections = heading.find_parents(["section", "article"])
    if parent_sections:
        toc_item.level = len(parent_sections)
    else:
        toc_item.level = 1

    try:
        toc_item.division = get_book_division(heading)
    except se.InvalidInputException as ex:
        # Re-raise with the file path for context, chaining the original exception.
        raise se.InvalidInputException(
            f"Couldn’t identify parent section in file: [path][link=file://{textf}]{textf}[/][/]."
        ) from ex

    # This stops the first heading in a file getting an anchor id, we don't generally want that.
    # The exceptions are things like poems within a single-file volume.
    toc_item.id = get_parent_id(heading)  # pylint: disable=invalid-name
    if toc_item.id == "":
        toc_item.file_link = textf
    else:
        if not is_toplevel:
            toc_item.file_link = f"{textf}#{toc_item.id}"
        elif single_file:  # It IS the first heading in the file, but there's only a single content file?
            toc_item.file_link = f"{textf}#{toc_item.id}"
        else:
            toc_item.file_link = textf

    toc_item.lang = heading.get("xml:lang") or ""

    # A heading may include z3998:roman directly,
    # eg <h5 epub:type="title z3998:roman">II</h5>.
    attribs = heading.get("epub:type") or ""

    if "z3998:roman" in attribs:
        toc_item.roman = extract_strings(heading)
        toc_item.title = f"<span epub:type=\"z3998:roman\">{toc_item.roman}</span>"
        return toc_item

    process_heading_contents(heading, toc_item)

    return toc_item
示例#8
0
	def __process_link(self, change_list, current_note_number, file_name, link, needs_rewrite, notes_changed) -> Tuple[bool, int]:
		"""
		Checks each endnote link to see if the existing anchor needs to be updated with a new number

		Returns a tuple of needs_write (whether object needs to be re-written), and the number of notes_changed
		"""

		# Extract just the anchor from a URL (ie, what follows a hash symbol)
		old_anchor = ""
		href = link.get_attr("href") or ""
		if href:
			hash_position = href.find("#") + 1  # we want the characters AFTER the hash
			if hash_position > 0:
				old_anchor = href[hash_position:]

		new_anchor = f"note-{current_note_number:d}"
		if new_anchor != old_anchor:
			change_list.append(f"Changed {old_anchor} to {new_anchor} in {file_name}")
			notes_changed += 1
			# Update the link in the dom
			link.set_attr("href", f"{self.endnotes_path.name}#{new_anchor}")
			link.set_attr("id", f"noteref-{current_note_number:d}")
			link.lxml_element.text = str(current_note_number)
			needs_rewrite = True

		# Now try to find this in endnotes; there must be exactly one match.
		matches = [note for note in self.endnotes if note.anchor == old_anchor]
		if not matches:
			raise se.InvalidInputException(f"Couldn’t find endnote with anchor [attr]{old_anchor}[/].")
		if len(matches) > 1:
			raise se.InvalidInputException(f"Duplicate anchors in endnotes file for anchor [attr]{old_anchor}[/].")

		# Found a single match, which is what we want
		endnote = matches[0]
		endnote.number = current_note_number
		endnote.matched = True
		# We don't change the anchor or the back ref just yet
		endnote.source_file = file_name
		return needs_rewrite, notes_changed
示例#9
0
def get_epub_type(soup: BeautifulSoup) -> str:
    """
	Retrieve the epub:type of this file to see if it's a landmark item.
	"""

    # Try for a heading.
    first_head = soup.find(["h1", "h2", "h3", "h4", "h5", "h6"])
    if first_head is not None:
        parent = first_head.find_parent(["section", "article", "body"])
    else:  # No heading found so go hunting for some other content.
        paragraph = soup.find(["p", "header",
                               "img"])  # We look for the first such item.
        if paragraph is not None:
            parent = paragraph.find_parent(["section", "article", "body"])
        else:
            return ""

    if parent is None:
        raise se.InvalidInputException("Couldn't find any section grouping.")

    try:
        return parent["epub:type"]
    except KeyError:
        # The grouping element carries no epub:type attribute.
        return ""
def add_landmark(dom: EasyXmlTree, textf: str, landmarks: list) -> None:
    """
	Adds an item to landmark list with appropriate details.

	INPUTS:
	dom: EasyXmlTree representation of the file we are indexing in ToC
	textf: path to the file
	landmarks: the list of landmark items we are building

	OUTPUTS:
	None
	"""

    # According to the IDPF a11y best practices page: <http://idpf.org/epub/a11y/techniques/#sem-003>:
    # > it is recommended to include a link to the start of the body matter as well as to any major
    # > reference sections (e.g., table of contents, endnotes, bibliography, glossary, index).
    #
    # So, we only want the start of the text, and (endnotes,glossary,bibliography,loi) in the landmarks.

    sections = dom.xpath(
        "//body/*[name() = 'section' or name() = 'article' or name() = 'nav']")
    if not sections:
        raise se.InvalidInputException(
            "Couldn’t locate first [xhtml]<section>[/], [xhtml]<article>[/], or [xhtml]<nav>[/]."
        )
    epub_type = sections[0].get_attr("epub:type")
    bodys = dom.xpath("//body")
    if not bodys:
        raise se.InvalidInputException("Couldn’t locate [xhtml]<body>[/].")

    if not epub_type:  # some productions don't have an epub:type in outermost section, so get it from body tag
        epub_type = bodys[0].get_attr("epub:type")
        if not epub_type:
            epub_type = ""

    if epub_type in ["frontmatter", "bodymatter", "backmatter"]:
        return  # if epub_type is ONLY frontmatter, bodymatter, backmatter, we don't want this as a landmark

    if dom.xpath("//*[contains(@epub:type, 'frontmatter')]"):
        return  # We don't want frontmatter in the landmarks

    if dom.xpath(
            "//*[contains(@epub:type, 'backmatter')]") and not regex.findall(
                r"\b(loi|endnotes|bibliography|glossary|index)\b", epub_type):
        return  # We only want certain backmatter in the landmarks

    # We may wind up with a (front|body|back)matter semantic in epub_type, remove it here since we add it to the landmark later
    epub_type = regex.sub(r"(front|body|back)matter\s*", "", epub_type)

    landmark = TocItem()
    if epub_type:
        landmark.epub_type = epub_type
        landmark.file_link = textf
        landmark.place = get_place(bodys[0])
        if epub_type == "halftitlepage":
            landmark.title = "Half Title"
        elif epub_type == "titlepage":
            # Exception: the titlepage is always titled 'Titlepage' in the ToC
            landmark.title = "Titlepage"
        else:
            landmark.title = dom.xpath(
                "//head/title/text()",
                True)  # Use the page title as the landmark entry title.
            if landmark.title is None:
                # This is a bit desperate, use this only if there's no proper <title> tag in file.
                landmark.title = landmark.epub_type.capitalize()
        landmarks.append(landmark)
示例#11
0
	def generate_endnotes(self) -> Tuple[int, int]:
		"""
		Read the epub spine to regenerate all endnotes in order of appearance, starting from 1.
		Changes are written to disk.

		Returns a tuple of (found_endnote_count, changed_endnote_count)
		"""

		# Do a safety check first, throw exception if it failed
		results = self._check_endnotes()
		if results:
			report = "\n".join(results)
			raise se.InvalidInputException(f"Endnote error(s) found: {report}.")

		# If we get here, it's safe to proceed
		processed = 0
		current_note_number = 1
		notes_changed = 0
		change_list: List[str] = []

		for file_path in self.spine_file_paths:
			dom = self.get_dom(file_path)

			# Skip the actual endnotes file, we'll handle that later
			if dom.xpath("/html/body//*[contains(@epub:type, 'endnotes')]"):
				continue

			processed += 1

			needs_rewrite = False
			for link in dom.xpath("/html/body//a[contains(@epub:type, 'noteref')]"):
				needs_rewrite, notes_changed = self.__process_link(change_list, current_note_number, file_path.name, link, needs_rewrite, notes_changed)
				current_note_number += 1

			# If we need to write back the body text file
			if needs_rewrite:
				# Write explicitly as UTF-8 so output doesn't depend on the platform's default encoding.
				with open(file_path, "w", encoding="utf-8") as file:
					file.write(se.formatting.format_xhtml(dom.to_string()))

		# Now process any endnotes WITHIN the endnotes
		for source_note in self.endnotes:
			node = source_note.node
			needs_rewrite = False
			for link in node.xpath(".//a[contains(@epub:type, 'noteref')]"):
				needs_rewrite, notes_changed = self.__process_link(change_list, current_note_number, self.endnotes_path.name, link, needs_rewrite, notes_changed)
				current_note_number += 1

		if processed == 0:
			raise se.InvalidInputException("No files processed. Did you update the manifest and order the spine?")

		if notes_changed > 0:
			# Now we need to recreate the endnotes file
			endnotes_dom = self.get_dom(self.endnotes_path)
			for ol_node in endnotes_dom.xpath("/html/body/section[contains(@epub:type, 'endnotes')]/ol[1]"):
				# Drop the existing notes, then re-append the matched ones in numeric order.
				for node in ol_node.xpath("./li[contains(@epub:type, 'endnote')]"):
					node.remove()

				self.endnotes.sort(key=lambda endnote: endnote.number)

				for endnote in self.endnotes:
					if endnote.matched:
						endnote.node.set_attr("id", f"note-{endnote.number}")

						for node in endnote.node.xpath(".//a[contains(@epub:type, 'backlink')]"):
							node.set_attr("href", f"{endnote.source_file}#noteref-{endnote.number}")

						ol_node.append(endnote.node)

			# Write explicitly as UTF-8 so output doesn't depend on the platform's default encoding.
			with open(self.endnotes_path, "w", encoding="utf-8") as file:
				file.write(se.formatting.format_xhtml(endnotes_dom.to_string()))

		return current_note_number - 1, notes_changed
示例#12
0
def _create_draft(args: Namespace):
    """
	Implementation for `se create-draft`
	"""

    # Put together some variables for later use
    identifier = se.formatting.make_url_safe(
        args.author) + "/" + se.formatting.make_url_safe(args.title)
    title_string = args.title.replace(
        "'", "’") + ", by " + args.author.replace("'", "’")
    sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", args.title)
    pg_producers = []

    if args.translator:
        identifier = identifier + "/" + se.formatting.make_url_safe(
            args.translator)
        title_string = title_string + ". Translated by " + args.translator

    if args.illustrator:
        identifier = identifier + "/" + se.formatting.make_url_safe(
            args.illustrator)
        title_string = title_string + ". Illustrated by " + args.illustrator

    repo_name = identifier.replace("/", "_")

    repo_path = Path(repo_name).resolve()

    if repo_path.is_dir():
        raise se.InvalidInputException(
            f"Directory already exists: [path][link=file://{repo_path}]{repo_path}[/][/]."
        )

    # Download PG HTML and do some fixups
    if args.pg_url:
        if args.offline:
            raise se.RemoteCommandErrorException(
                "Cannot download Project Gutenberg ebook when offline option is enabled."
            )

        args.pg_url = args.pg_url.replace("http://", "https://")

        # Get the ebook metadata
        try:
            response = requests.get(args.pg_url)
            pg_metadata_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook metadata page. Exception: {ex}"
            )

        soup = BeautifulSoup(pg_metadata_html, "lxml")

        # Get the ebook HTML URL from the metadata
        pg_ebook_url = None
        for element in soup.select("a[type^=\"text/html\"]"):
            pg_ebook_url = regex.sub(r"^//", "https://", element["href"])
            pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/",
                                     pg_ebook_url)

        if not pg_ebook_url:
            raise se.RemoteCommandErrorException(
                "Could download ebook metadata, but couldn’t find URL for the ebook HTML."
            )

        # Get the ebook LCSH categories
        pg_subjects = []
        for element in soup.select("td[property=\"dcterms:subject\"]"):
            if element["datatype"] == "dcterms:LCSH":
                for subject_link in element.find("a"):
                    pg_subjects.append(subject_link.strip())

        # Get the PG publication date
        pg_publication_year = None
        for element in soup.select("td[itemprop=\"datePublished\"]"):
            pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1",
                                            element.text)

        # Get the actual ebook URL
        try:
            response = requests.get(pg_ebook_url)
            pg_ebook_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook HTML. Exception: {ex}"
            )

        try:
            fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
            pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
        except Exception as ex:
            raise se.InvalidEncodingException(
                f"Couldn’t determine text encoding of Project Gutenberg HTML file. Exception: {ex}"
            )

        # Try to guess the ebook language
        pg_language = "en-US"
        if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
            pg_language = "en-GB"

    # Create necessary directories
    (repo_path / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "css").mkdir(parents=True)
    (repo_path / "src" / "epub" / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "text").mkdir(parents=True)
    (repo_path / "src" / "META-INF").mkdir(parents=True)

    is_pg_html_parsed = True

    # Write PG data if we have it
    if args.pg_url and pg_ebook_html:
        try:
            soup = BeautifulSoup(pg_ebook_html, "html.parser")

            # Try to get the PG producers.  We only try this if there's a <pre> block with the header info (which is not always the case)
            for element in soup(text=regex.compile(r"\*\*\*\s*Produced by.+$",
                                                   flags=regex.DOTALL)):
                if element.parent.name == "pre":
                    producers_text = regex.sub(r".+?Produced by (.+?)\s*$",
                                               "\\1",
                                               element,
                                               flags=regex.DOTALL)
                    producers_text = regex.sub(r"\(.+?\)",
                                               "",
                                               producers_text,
                                               flags=regex.DOTALL)
                    producers_text = regex.sub(
                        r"(at )?https?://www\.pgdp\.net",
                        "",
                        producers_text,
                        flags=regex.DOTALL)
                    producers_text = regex.sub(r"[\r\n]+",
                                               " ",
                                               producers_text,
                                               flags=regex.DOTALL)
                    producers_text = regex.sub(r",? and ", ", and ",
                                               producers_text)
                    producers_text = producers_text.replace(
                        " and the Online", " and The Online")
                    producers_text = producers_text.replace(", and ",
                                                            ", ").strip()

                    pg_producers = producers_text.split(", ")

            # Try to strip out the PG header
            for element in soup(text=regex.compile(r"\*\*\*\s*START OF THIS")):
                for sibling in element.parent.find_previous_siblings():
                    sibling.decompose()

                element.parent.decompose()

            # Try to strip out the PG license footer
            for element in soup(
                    text=regex.compile(r"End of (the )?Project Gutenberg")):
                for sibling in element.parent.find_next_siblings():
                    sibling.decompose()

                element.parent.decompose()

            with open(repo_path / "src" / "epub" / "text" / "body.xhtml",
                      "w",
                      encoding="utf-8") as file:
                file.write(str(soup))
        except OSError as ex:
            raise se.InvalidFileException(
                f"Couldn’t write to ebook directory. Exception: {ex}")
        except:
            # Save this error for later, because it's still useful to complete the create-draft process
            # even if we've failed to parse PG's HTML source.
            is_pg_html_parsed = False
            se.quiet_remove(repo_path / "src" / "epub" / "text" / "body.xhtml")

    # Copy over templates

    _copy_template_file("gitignore", repo_path / ".gitignore")
    _copy_template_file("LICENSE.md", repo_path)
    _copy_template_file("container.xml", repo_path / "src" / "META-INF")
    _copy_template_file("mimetype", repo_path / "src")
    _copy_template_file("content.opf", repo_path / "src" / "epub")
    _copy_template_file("onix.xml", repo_path / "src" / "epub")
    _copy_template_file("toc.xhtml", repo_path / "src" / "epub")
    _copy_template_file("core.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("local.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("logo.svg", repo_path / "src" / "epub" / "images")
    _copy_template_file("colophon.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("imprint.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("uncopyright.xhtml",
                        repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.svg", repo_path / "images")
    _copy_template_file("cover.jpg", repo_path / "images" / "cover.jpg")
    _copy_template_file("cover.svg", repo_path / "images" / "cover.svg")

    # Try to find Wikipedia links if possible
    if args.offline:
        author_wiki_url = None
        author_nacoaf_url = None
        ebook_wiki_url = None
        translator_wiki_url = None
        translator_nacoaf_url = None
    else:
        author_wiki_url, author_nacoaf_url = _get_wikipedia_url(
            args.author, True)
        ebook_wiki_url = None
        if args.title != "Short Fiction":
            # There's a "Short Fiction" Wikipedia article, so make an exception for that case
            ebook_wiki_url, _ = _get_wikipedia_url(args.title, False)
        translator_wiki_url = None
        if args.translator:
            translator_wiki_url, translator_nacoaf_url = _get_wikipedia_url(
                args.translator, True)

    # Pre-fill a few templates
    _replace_in_file(repo_path / "src" / "epub" / "text" / "titlepage.xhtml",
                     "TITLE_STRING", title_string)
    _replace_in_file(repo_path / "images" / "titlepage.svg", "TITLE_STRING",
                     title_string)
    _replace_in_file(repo_path / "images" / "cover.svg", "TITLE_STRING",
                     title_string)

    # Create the titlepage SVG
    contributors = {}
    if args.translator:
        contributors["translated by"] = args.translator

    if args.illustrator:
        contributors["illustrated by"] = args.illustrator

    with open(repo_path / "images" / "titlepage.svg", "w",
              encoding="utf-8") as file:
        file.write(
            _generate_titlepage_svg(args.title, args.author, contributors,
                                    title_string))

    # Create the cover SVG
    with open(repo_path / "images" / "cover.svg", "w",
              encoding="utf-8") as file:
        file.write(_generate_cover_svg(args.title, args.author, title_string))

    # Build the cover/titlepage for distribution
    epub = SeEpub(repo_path)
    epub.generate_cover_svg()
    epub.generate_titlepage_svg()

    if args.pg_url:
        _replace_in_file(repo_path / "src" / "epub" / "text" / "imprint.xhtml",
                         "PG_URL", args.pg_url)

    with open(repo_path / "src" / "epub" / "text" / "colophon.xhtml",
              "r+",
              encoding="utf-8") as file:
        colophon_xhtml = file.read()

        colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
        colophon_xhtml = colophon_xhtml.replace(">AUTHOR<", f">{args.author}<")
        colophon_xhtml = colophon_xhtml.replace("TITLE", args.title)

        if author_wiki_url:
            colophon_xhtml = colophon_xhtml.replace("AUTHOR_WIKI_URL",
                                                    author_wiki_url)

        if args.pg_url:
            colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

            if pg_publication_year:
                colophon_xhtml = colophon_xhtml.replace(
                    "PG_YEAR", pg_publication_year)

            if pg_producers:
                producers_xhtml = ""
                for i, producer in enumerate(pg_producers):
                    if "Distributed Proofread" in producer:
                        producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
                    elif "anonymous" in producer.lower():
                        producers_xhtml = producers_xhtml + "<b class=\"name\">An Anonymous Volunteer</b>"
                    else:
                        producers_xhtml = producers_xhtml + f"<b class=\"name\">{producer.strip('.')}</b>"

                    if i < len(pg_producers) - 1:
                        producers_xhtml = producers_xhtml + ", "

                    if i == len(pg_producers) - 2:
                        producers_xhtml = producers_xhtml + "and "

                producers_xhtml = producers_xhtml + "<br/>"

                colophon_xhtml = colophon_xhtml.replace(
                    "<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>",
                    producers_xhtml)

        file.seek(0)
        file.write(colophon_xhtml)
        file.truncate()

    with open(repo_path / "src" / "epub" / "content.opf",
              "r+",
              encoding="utf-8") as file:
        metadata_xml = file.read()

        metadata_xml = metadata_xml.replace("SE_IDENTIFIER", identifier)
        metadata_xml = metadata_xml.replace(">AUTHOR<", f">{args.author}<")
        metadata_xml = metadata_xml.replace(">TITLE_SORT<",
                                            f">{sorted_title}<")
        metadata_xml = metadata_xml.replace(">TITLE<", f">{args.title}<")
        metadata_xml = metadata_xml.replace("VCS_IDENTIFIER", str(repo_name))

        if pg_producers:
            producers_xhtml = ""
            i = 1
            for producer in pg_producers:
                if "Distributed Proofread" in producer:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">The Online Distributed Proofreading Team</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{i}\">https://pgdp.net</meta>\n"
                elif "anonymous" in producer.lower():
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">An Anonymous Volunteer</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Anonymous Volunteer, An</meta>\n"
                else:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">{producer.strip('.')}</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">TRANSCRIBER_SORT</meta>\n"

                producers_xhtml = producers_xhtml + f"\t\t<meta property=\"role\" refines=\"#transcriber-{i}\" scheme=\"marc:relators\">trc</meta>\n"

                i = i + 1

            metadata_xml = regex.sub(
                r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>",
                "\t\t" + producers_xhtml.strip(),
                metadata_xml,
                flags=regex.DOTALL)

        if author_wiki_url:
            metadata_xml = metadata_xml.replace(">AUTHOR_WIKI_URL<",
                                                f">{author_wiki_url}<")

        if author_nacoaf_url:
            metadata_xml = metadata_xml.replace(">AUTHOR_NACOAF_URL<",
                                                f">{author_nacoaf_url}<")

        if ebook_wiki_url:
            metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<",
                                                f">{ebook_wiki_url}<")

        if args.translator:
            metadata_xml = metadata_xml.replace(">TRANSLATOR<",
                                                f">{args.translator}<")

            if translator_wiki_url:
                metadata_xml = metadata_xml.replace(
                    ">TRANSLATOR_WIKI_URL<", f">{translator_wiki_url}<")

            if translator_nacoaf_url:
                metadata_xml = metadata_xml.replace(
                    ">TRANSLATOR_NACOAF_URL<", f">{translator_nacoaf_url}<")
        else:
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?<dc:contributor id=\"artist\">",
                "<dc:contributor id=\"artist\">",
                metadata_xml,
                flags=regex.DOTALL)

        if args.pg_url:
            if pg_subjects:
                subject_xhtml = ""

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<dc:subject id=\"subject-{i}\">{subject}</dc:subject>\n"
                    i = i + 1

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<meta property=\"authority\" refines=\"#subject-{i}\">LCSH</meta>\n"

                    # Now, get the LCSH ID by querying LCSH directly.
                    try:
                        response = requests.get(
                            f"https://id.loc.gov/search/?q=%22{urllib.parse.quote(subject)}%22"
                        )
                        result = regex.search(
                            fr"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{regex.escape(subject.replace(' -- ', '--'))}</a>",
                            response.text)

                        loc_id = "Unknown"
                        try:
                            loc_id = result.group(1)
                        except Exception as ex:
                            pass

                        subject_xhtml = subject_xhtml + f"\t\t<meta property=\"term\" refines=\"#subject-{i}\">{loc_id}</meta>\n"

                    except Exception as ex:
                        raise se.RemoteCommandErrorException(
                            f"Couldn’t connect to [url][link=https://id.loc.gov]https://id.loc.gov[/][/]. Exception: {ex}"
                        )

                    i = i + 1

                metadata_xml = regex.sub(
                    r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>",
                    "\t\t" + subject_xhtml.strip(), metadata_xml)

            metadata_xml = metadata_xml.replace(
                "<dc:language>LANG</dc:language>",
                f"<dc:language>{pg_language}</dc:language>")
            metadata_xml = metadata_xml.replace(
                "<dc:source>PG_URL</dc:source>",
                f"<dc:source>{args.pg_url}</dc:source>")

        file.seek(0)
        file.write(metadata_xml)
        file.truncate()

    # Set up local git repo
    repo = git.Repo.init(repo_path)

    if args.email:
        with repo.config_writer() as config:
            config.set_value("user", "email", args.email)

    if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
        raise se.InvalidXhtmlException(
            "Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook."
        )
Example #13
0
def build(self, metadata_xhtml, metadata_tree, run_epubcheck, build_kobo, build_kindle, output_directory, proof, build_covers, verbose):
	"""
	Entry point for `se build`
	"""

	calibre_app_mac_path = "/Applications/calibre.app/Contents/MacOS/"
	epubcheck_path = shutil.which("epubcheck")
	ebook_convert_path = shutil.which("ebook-convert")
	# Look for default Mac calibre app path if none found in path
	if ebook_convert_path is None and os.path.exists(calibre_app_mac_path):
		ebook_convert_path = os.path.join(calibre_app_mac_path, "ebook-convert")
	rsvg_convert_path = shutil.which("rsvg-convert")
	convert_path = shutil.which("convert")
	navdoc2ncx_xsl_filename = resource_filename("se", os.path.join("data", "navdoc2ncx.xsl"))
	mathml_xsl_filename = resource_filename("se", os.path.join("data", "mathmlcontent2presentation.xsl"))

	# Check for some required tools
	if run_epubcheck and epubcheck_path is None:
		raise se.MissingDependencyException("Couldn’t locate epubcheck. Is it installed?")

	if rsvg_convert_path is None:
		raise se.MissingDependencyException("Couldn’t locate rsvg-convert. Is librsvg2-bin installed?")

	if build_kindle and ebook_convert_path is None:
		raise se.MissingDependencyException("Couldn’t locate ebook-convert. Is Calibre installed?")

	if build_kindle and convert_path is None:
		raise se.MissingDependencyException("Couldn’t locate convert. Is Imagemagick installed?")

	# Check the output directory and create it if it doesn't exist
	if output_directory is None:
		output_directory = os.getcwd()
	else:
		output_directory = output_directory

	output_directory = os.path.abspath(output_directory)

	if os.path.exists(output_directory):
		if not os.path.isdir(output_directory):
			raise se.InvalidInputException("Not a directory: {}".format(output_directory))
	else:
		# Doesn't exist, try to create it
		try:
			os.makedirs(output_directory)
		except OSError as exception:
			if exception.errno != errno.EEXIST:
				raise se.FileExistsException("Couldn’t create output directory.")

	# All clear to start building!
	if verbose:
		print("Building {} ...".format(self.directory))

	with tempfile.TemporaryDirectory() as work_directory:
		work_epub_root_directory = os.path.join(work_directory, "src")

		copy_tree(self.directory, work_directory)
		try:
			shutil.rmtree(os.path.join(work_directory, ".git"))
		except Exception:
			pass

		# By convention the ASIN is set to the SHA-1 sum of the book's identifying URL
		identifier = metadata_tree.xpath("//dc:identifier")[0].inner_html().replace("url:", "")
		asin = sha1(identifier.encode("utf-8")).hexdigest()

		title = metadata_tree.xpath("//dc:title")[0].inner_html()
		url_title = se.formatting.make_url_safe(title)

		url_author = ""
		for author in metadata_tree.xpath("//dc:creator"):
			url_author = url_author + se.formatting.make_url_safe(author.inner_html()) + "_"

		url_author = url_author.rstrip("_")

		epub_output_filename = "{}_{}{}.epub".format(url_author, url_title, ".proof" if proof else "")
		epub3_output_filename = "{}_{}{}.epub3".format(url_author, url_title, ".proof" if proof else "")
		kobo_output_filename = "{}_{}{}.kepub.epub".format(url_author, url_title, ".proof" if proof else "")
		kindle_output_filename = "{}_{}{}.azw3".format(url_author, url_title, ".proof" if proof else "")

		# Clean up old output files if any
		for kindle_thumbnail in glob.glob(os.path.join(output_directory, "thumbnail_{}_EBOK_portrait.jpg".format(asin))):
			se.quiet_remove(kindle_thumbnail)
		se.quiet_remove(os.path.join(output_directory, "cover.jpg"))
		se.quiet_remove(os.path.join(output_directory, "cover-thumbnail.jpg"))
		se.quiet_remove(os.path.join(output_directory, epub_output_filename))
		se.quiet_remove(os.path.join(output_directory, epub3_output_filename))
		se.quiet_remove(os.path.join(output_directory, kobo_output_filename))
		se.quiet_remove(os.path.join(output_directory, kindle_output_filename))

		# Are we including proofreading CSS?
		if proof:
			with open(os.path.join(work_epub_root_directory, "epub", "css", "local.css"), "a", encoding="utf-8") as local_css_file:
				with open(resource_filename("se", os.path.join("data", "templates", "proofreading.css")), "r", encoding="utf-8") as proofreading_css_file:
					local_css_file.write(proofreading_css_file.read())

		# Output the pure epub3 file
		if verbose:
			print("\tBuilding {} ...".format(epub3_output_filename), end="", flush=True)

		se.epub.write_epub(work_epub_root_directory, os.path.join(output_directory, epub3_output_filename))

		if verbose:
			print(" OK")

		if build_kobo:
			if verbose:
				print("\tBuilding {} ...".format(kobo_output_filename), end="", flush=True)
		else:
			if verbose:
				print("\tBuilding {} ...".format(epub_output_filename), end="", flush=True)

		# Now add epub2 compatibility.

		# Include compatibility CSS
		with open(os.path.join(work_epub_root_directory, "epub", "css", "core.css"), "a", encoding="utf-8") as core_css_file:
			with open(resource_filename("se", os.path.join("data", "templates", "compatibility.css")), "r", encoding="utf-8") as compatibility_css_file:
				core_css_file.write(compatibility_css_file.read())

		# Simplify CSS and tags
		total_css = ""

		# Simplify the CSS first.  Later we'll update the document to match our simplified selectors.
		# While we're doing this, we store the original css into a single variable so we can extract the original selectors later.
		for root, _, filenames in os.walk(work_epub_root_directory):
			for filename in fnmatch.filter(filenames, "*.css"):
				with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
					css = file.read()

					# Before we do anything, we process a special case in core.css
					if "core.css" in filename:
						css = regex.sub(r"abbr{.+?}", "", css, flags=regex.DOTALL)

					total_css = total_css + css + "\n"
					file.seek(0)
					file.write(se.formatting.simplify_css(css))
					file.truncate()

		# Now get a list of original selectors
		# Remove @supports(){}
		total_css = regex.sub(r"@supports.+?{(.+?)}\s*}", "\\1}", total_css, flags=regex.DOTALL)

		# Remove CSS rules
		total_css = regex.sub(r"{[^}]+}", "", total_css)

		# Remove trailing commas
		total_css = regex.sub(r",", "", total_css)

		# Remove comments
		total_css = regex.sub(r"/\*.+?\*/", "", total_css, flags=regex.DOTALL)

		# Remove @ defines
		total_css = regex.sub(r"^@.+", "", total_css, flags=regex.MULTILINE)

		# Construct a dictionary of the original selectors
		selectors = set([line for line in total_css.splitlines() if line != ""])

		# Get a list of .xhtml files to simplify
		for root, _, filenames in os.walk(work_epub_root_directory):
			for filename in fnmatch.filter(filenames, "*.xhtml"):
				# Don't mess with the ToC, since if we have ol/li > first-child selectors we could screw it up
				if filename == "toc.xhtml":
					continue

				with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
					# We have to remove the default namespace declaration from our document, otherwise
					# xpath won't find anything at all.  See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python
					xhtml = file.read().replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "")
					processed_xhtml = xhtml
					try:
						tree = etree.fromstring(str.encode(xhtml))
					except Exception as ex:
						raise se.InvalidXhtmlException("Error parsing XHTML file: {}\n{}".format(filename, ex))

					# Now iterate over each CSS selector and see if it's used in any of the files we found
					force_convert = False
					for selector in selectors:
						try:
							sel = lxml.cssselect.CSSSelector(selector, translator="xhtml", namespaces=se.XHTML_NAMESPACES)

							# Add classes to elements that match any of our selectors to simplify. For example, if we select :first-child, add a "first-child" class to all elements that match that.
							for selector_to_simplify in se.SELECTORS_TO_SIMPLIFY:
								if selector_to_simplify in selector:
									selector_to_simplify = selector_to_simplify.replace(":", "")
									for element in tree.xpath(sel.path, namespaces=se.XHTML_NAMESPACES):
										current_class = element.get("class")
										if current_class is not None and selector_to_simplify not in current_class:
											current_class = current_class + " " + selector_to_simplify
										else:
											current_class = selector_to_simplify

										element.set("class", current_class)

						except lxml.cssselect.ExpressionError:
							# This gets thrown if we use pseudo-elements, which lxml doesn't support
							# We force a check if we get thrown this because we might miss some important ::before elements
							force_convert = True

						# We've already replaced attribute/namespace selectors with classes in the CSS, now add those classes to the matching elements
						if force_convert or "[epub|type" in selector:
							for namespace_selector in regex.findall(r"\[epub\|type\~\=\"[^\"]*?\"\]", selector):
								sel = lxml.cssselect.CSSSelector(namespace_selector, translator="xhtml", namespaces=se.XHTML_NAMESPACES)

								for element in tree.xpath(sel.path, namespaces=se.XHTML_NAMESPACES):
									new_class = regex.sub(r"^\.", "", se.formatting.namespace_to_class(namespace_selector))
									current_class = element.get("class", "")

									if new_class not in current_class:
										current_class = "{} {}".format(current_class, new_class).strip()
										element.set("class", current_class)

					processed_xhtml = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + etree.tostring(tree, encoding=str, pretty_print=True)

					# We do this round in a second pass because if we modify the tree like this, it screws up how lxml does processing later.
					# If it's all done in one pass, we wind up in a race condition where some elements are fixed and some not
					tree = etree.fromstring(str.encode(processed_xhtml))

					for selector in selectors:
						try:
							sel = lxml.cssselect.CSSSelector(selector, translator="xhtml", namespaces=se.XHTML_NAMESPACES)
						except lxml.cssselect.ExpressionError:
							# This gets thrown if we use pseudo-elements, which lxml doesn't support
							continue

						# Convert <abbr> to <span>
						if "abbr" in selector:
							for element in tree.xpath(sel.path, namespaces=se.XHTML_NAMESPACES):
								# Why would you want the tail to output by default?!?
								raw_string = etree.tostring(element, encoding=str, with_tail=False)

								# lxml--crap as usual--includes a bunch of namespace information in every element we print.
								# Remove it heregex.
								raw_string = raw_string.replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "")
								raw_string = raw_string.replace(" xmlns:epub=\"http://www.idpf.org/2007/ops\"", "")

								# Now lxml doesn't let us modify the tree, so we just do a straight up regex replace to turn this into a span
								processed_string = raw_string.replace("<abbr", "<span")
								processed_string = processed_string.replace("</abbr", "</span")

								# Now we have a nice, fixed string.  But, since lxml can't replace elements, we write it ourselves.
								processed_xhtml = processed_xhtml.replace(raw_string, processed_string)

								tree = etree.fromstring(str.encode(processed_xhtml))

					# Now we just remove all stray abbr tags that were not styled by CSS
					processed_xhtml = regex.sub(r"</?abbr[^>]*?>", "", processed_xhtml)

					# Remove datetime="" attribute in <time> tags, which is not always understood by epubcheck
					processed_xhtml = regex.sub(r" datetime=\"[^\"]+?\"", "", processed_xhtml)

					tree = etree.fromstring(str.encode(processed_xhtml))

					if processed_xhtml != xhtml:
						file.seek(0)
						file.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + etree.tostring(tree, encoding=str, pretty_print=True).replace("<html", "<html xmlns=\"http://www.w3.org/1999/xhtml\""))
						file.truncate()

		# Done simplifying CSS and tags!

		# Extract cover and cover thumbnail
		# We used to be able to use `convert` to convert svg -> jpg in one step, but at some point a bug
		# was introduced to `convert` that caused it to crash in this situation. Now, we first use rsvg-convert
		# to convert to svg -> png, then `convert` to convert png -> jpg.
		subprocess.run([rsvg_convert_path, "--keep-aspect-ratio", "--format", "png", "--output", os.path.join(work_directory, 'cover.png'), os.path.join(work_epub_root_directory, "epub", "images", "cover.svg")])
		subprocess.run([convert_path, "-format", "jpg", os.path.join(work_directory, 'cover.png'), os.path.join(work_epub_root_directory, "epub", "images", "cover.jpg")])
		os.remove(os.path.join(work_directory, 'cover.png'))

		if build_covers:
			shutil.copy2(os.path.join(work_epub_root_directory, "epub", "images", "cover.jpg"), os.path.join(output_directory, "cover.jpg"))
			shutil.copy2(os.path.join(work_epub_root_directory, "epub", "images", "cover.svg"), os.path.join(output_directory, "cover-thumbnail.svg"))
			subprocess.run([rsvg_convert_path, "--keep-aspect-ratio", "--format", "png", "--output", os.path.join(work_directory, 'cover-thumbnail.png'), os.path.join(output_directory, "cover-thumbnail.svg")])
			subprocess.run([convert_path, "-resize", "{}x{}".format(COVER_THUMBNAIL_WIDTH, COVER_THUMBNAIL_HEIGHT), "-quality", "100", "-format", "jpg", os.path.join(work_directory, 'cover-thumbnail.png'), os.path.join(output_directory, "cover-thumbnail.jpg")])
			os.remove(os.path.join(work_directory, 'cover-thumbnail.png'))
			os.remove(os.path.join(output_directory, "cover-thumbnail.svg"))

		os.remove(os.path.join(work_epub_root_directory, "epub", "images", "cover.svg"))

		# Massage image references in content.opf
		metadata_xhtml = metadata_xhtml.replace("cover.svg", "cover.jpg")
		metadata_xhtml = metadata_xhtml.replace(".svg", ".png")
		metadata_xhtml = metadata_xhtml.replace("id=\"cover.jpg\" media-type=\"image/svg+xml\"", "id=\"cover.jpg\" media-type=\"image/jpeg\"")
		metadata_xhtml = metadata_xhtml.replace("image/svg+xml", "image/png")
		metadata_xhtml = regex.sub(r"properties=\"([^\"]*?)svg([^\"]*?)\"", "properties=\"\\1\\2\"", metadata_xhtml) # We may also have the `mathml` property

		# NOTE: even though the a11y namespace is reserved by the epub spec, we must declare it because epubcheck doesn't know that yet.
		# Once epubcheck understands the a11y namespace is reserved, we can remove it from the namespace declarations.
		metadata_xhtml = metadata_xhtml.replace(" prefix=\"se: https://standardebooks.org/vocab/1.0\"", " prefix=\"se: https://standardebooks.org/vocab/1.0, a11y: https://www.idpf.org/epub/vocab/package/a11y/\"")

		# Google Play Books chokes on https XML namespace identifiers (as of at least 2017-07)
		metadata_xhtml = metadata_xhtml.replace("https://standardebooks.org/vocab/1.0", "http://standardebooks.org/vocab/1.0")
		metadata_xhtml = metadata_xhtml.replace("https://www.idpf.org/epub/vocab/package/a11y/", "http://www.idpf.org/epub/vocab/package/a11y/")

		# Output the modified content.opf so that we can build the kobo book before making more epub2 compatibility hacks
		with open(os.path.join(work_epub_root_directory, "epub", "content.opf"), "w", encoding="utf-8") as file:
			file.write(metadata_xhtml)
			file.truncate()

		# Recurse over xhtml files to make some compatibility replacements
		for root, _, filenames in os.walk(work_epub_root_directory):
			for filename in filenames:
				if filename.lower().endswith(".svg"):
					# For night mode compatibility, give the titlepage a 1px white stroke attribute
					if filename.lower() == "titlepage.svg" or filename.lower() == "logo.svg":
						with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
							svg = file.read()
							paths = svg

							# What we're doing here is faking the `stroke-align: outside` property, which is an unsupported draft spec right now.
							# We do this by duplicating all the SVG paths, and giving the duplicates a 2px stroke.  The originals are directly on top,
							# so the 2px stroke becomes a 1px stroke that's *outside* of the path instead of being *centered* on the path border.
							# This looks much nicer, but we also have to increase the image size by 2px in both directions, and re-center the whole thing.

							if filename.lower() == "titlepage.svg":
								stroke_width = SVG_TITLEPAGE_OUTER_STROKE_WIDTH
							else:
								stroke_width = SVG_OUTER_STROKE_WIDTH

							# First, strip out non-path, non-group elements
							paths = regex.sub(r"<\?xml[^<]+?\?>", "", paths)
							paths = regex.sub(r"</?svg[^<]*?>", "", paths)
							paths = regex.sub(r"<title>[^<]+?</title>", "", paths)
							paths = regex.sub(r"<desc>[^<]+?</desc>", "", paths)

							# `paths` is now our "duplicate".  Add a 2px stroke.
							paths = paths.replace("<path", "<path style=\"stroke: #ffffff; stroke-width: {}px;\"".format(stroke_width))

							# Inject the duplicate under the old SVG paths.  We do this by only replacing the first regex match for <g> or <path>
							svg = regex.sub(r"(<g|<path)", "{}\\1".format(paths), svg, 1)

							# If this SVG specifies height/width, then increase height and width by 2 pixels and translate everything by 1px
							try:
								height = int(regex.search(r"<svg[^>]+?height=\"([0-9]+)\"", svg).group(1)) + stroke_width
								svg = regex.sub(r"<svg([^<]*?)height=\"[0-9]+\"", "<svg\\1height=\"{}\"".format(height), svg)

								width = int(regex.search(r"<svg[^>]+?width=\"([0-9]+)\"", svg).group(1)) + stroke_width
								svg = regex.sub(r"<svg([^<]*?)width=\"[0-9]+\"", "<svg\\1width=\"{}\"".format(width), svg)

								# Add a grouping element to translate everything over 1px
								svg = regex.sub(r"(<g|<path)", "<g transform=\"translate({amount}, {amount})\">\n\\1".format(amount=(stroke_width / 2)), svg, 1)
								svg = svg.replace("</svg>", "</g>\n</svg>")
							except AttributeError:
								# Thrown when the regex doesn't match (i.e. SVG doesn't specify height/width)
								pass

							file.seek(0)
							file.write(svg)
							file.truncate()

					# Convert SVGs to PNGs at 2x resolution
					# We use `rsvg-convert` instead of `inkscape` or `convert` because it gives us an easy way of zooming in at 2x
					subprocess.run([rsvg_convert_path, "--zoom", "2", "--keep-aspect-ratio", "--format", "png", "--output", regex.sub(r"\.svg$", ".png", os.path.join(root, filename)), os.path.join(root, filename)])
					os.remove(os.path.join(root, filename))

				if filename.lower().endswith(".xhtml"):
					with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
						xhtml = file.read()
						processed_xhtml = xhtml

						# Check if there's any MathML to convert.
						# We expect MathML to be the "content" type (versus the "presentational" type).
						# We use an XSL transform to convert from "content" to "presentational" MathML.
						# If we start with presentational, then nothing will be changed.
						# Kobo supports presentational MathML. After we build kobo, we convert the presentational MathML to PNG for the rest of the builds.
						mathml_transform = None
						for line in regex.findall(r"<(?:m:)?math[^>]*?>(.+?)</(?:m:)?math>", processed_xhtml, flags=regex.DOTALL):
							mathml_content_tree = se.easy_xml.EasyXmlTree("<?xml version=\"1.0\" encoding=\"utf-8\"?><math xmlns=\"http://www.w3.org/1998/Math/MathML\">{}</math>".format(regex.sub(r"<(/?)m:", "<\\1", line)))

							# Initialize the transform object, if we haven't yet
							if not mathml_transform:
								mathml_transform = etree.XSLT(etree.parse(mathml_xsl_filename))

							# Transform the mathml and get a string representation
							# XSLT comes from https://github.com/fred-wang/webextension-content-mathml-polyfill
							mathml_presentation_tree = mathml_transform(mathml_content_tree.etree)
							mathml_presentation_xhtml = etree.tostring(mathml_presentation_tree, encoding="unicode", pretty_print=True, with_tail=False).strip()

							# Plop our string back in to the XHTML we're processing
							processed_xhtml = regex.sub(r"<math[^>]*?>\{}\</math>".format(regex.escape(line)), mathml_presentation_xhtml, processed_xhtml, flags=regex.MULTILINE)

						# Add ARIA roles, which are just mostly duplicate attributes to epub:type (with the exception of rearnotes -> endnotes, and adding the `backlink` role which is not yet in epub 3.0)
						processed_xhtml = regex.sub(r"(epub:type=\"[^\"]*?rearnote(s?)[^\"]*?\")", "\\1 role=\"doc-endnote\\2\"", processed_xhtml)

						if filename == "endnotes.xhtml":
							processed_xhtml = processed_xhtml.replace(" epub:type=\"se:referrer\"", " role=\"doc-backlink\" epub:type=\"se:referrer\"")

							# iOS renders the left-arrow-hook character as an emoji; this fixes it and forces it to renderr as text.
							# See https://github.com/standardebooks/tools/issues/73
							# See http://mts.io/2015/04/21/unicode-symbol-render-text-emoji/
							processed_xhtml = processed_xhtml.replace("\u21a9", "\u21a9\ufe0e")

						for role in se.ARIA_ROLES:
							processed_xhtml = regex.sub(r"(epub:type=\"[^\"]*?{}[^\"]*?\")".format(role), "\\1 role=\"doc-{}\"".format(role), processed_xhtml)

						# Since we convert SVGs to raster, here we add the color-depth semantic for night mode
						processed_xhtml = processed_xhtml.replace("z3998:publisher-logo", "z3998:publisher-logo se:image.color-depth.black-on-transparent")
						processed_xhtml = regex.sub(r"class=\"([^\"]*?)epub-type-z3998-publisher-logo([^\"]*?)\"", "class=\"\\1epub-type-z3998-publisher-logo epub-type-se-image-color-depth-black-on-transparent\\2\"", processed_xhtml)

						# Special case for the titlepage
						if filename == "titlepage.xhtml":
							processed_xhtml = processed_xhtml.replace("<img", "<img class=\"epub-type-se-image-color-depth-black-on-transparent\" epub:type=\"se:image.color-depth.black-on-transparent\"")

						# Google Play Books chokes on https XML namespace identifiers (as of at least 2017-07)
						processed_xhtml = processed_xhtml.replace("https://standardebooks.org/vocab/1.0", "http://standardebooks.org/vocab/1.0")

						# We converted svgs to pngs, so replace references
						processed_xhtml = processed_xhtml.replace("cover.svg", "cover.jpg")
						processed_xhtml = processed_xhtml.replace(".svg", ".png")

						# To get popup footnotes in iBooks, we have to change epub:rearnote to epub:footnote.
						# Remember to get our custom style selectors too.
						processed_xhtml = regex.sub(r"epub:type=\"([^\"]*?)rearnote([^\"]*?)\"", "epub:type=\"\\1footnote\\2\"", processed_xhtml)
						processed_xhtml = regex.sub(r"class=\"([^\"]*?)epub-type-rearnote([^\"]*?)\"", "class=\"\\1epub-type-footnote\\2\"", processed_xhtml)

						# Include extra lang tag for accessibility compatibility.
						processed_xhtml = regex.sub(r"xml:lang\=\"([^\"]+?)\"", "lang=\"\\1\" xml:lang=\"\\1\"", processed_xhtml)

						# Typography: replace double and triple em dash characters with extra em dashes.
						processed_xhtml = processed_xhtml.replace("⸺", "—{}—".format(se.WORD_JOINER))
						processed_xhtml = processed_xhtml.replace("⸻", "—{}—{}—".format(se.WORD_JOINER, se.WORD_JOINER))

						# Typography: replace some other less common characters.
						processed_xhtml = processed_xhtml.replace("⅒", "1/10")
						processed_xhtml = processed_xhtml.replace("℅", "c/o")
						processed_xhtml = processed_xhtml.replace("✗", "×")
						processed_xhtml = processed_xhtml.replace(" ", "{}{}".format(se.NO_BREAK_SPACE, se.NO_BREAK_SPACE)) # em-space to two nbsps

						# Many e-readers don't support the word joiner character (U+2060).
						# They DO, however, support the now-deprecated zero-width non-breaking space (U+FEFF)
						# For epubs, do this replacement.  Kindle now seems to handle everything fortunately.
						processed_xhtml = processed_xhtml.replace(se.WORD_JOINER, se.ZERO_WIDTH_SPACE)

						if processed_xhtml != xhtml:
							file.seek(0)
							file.write(processed_xhtml)
							file.truncate()

				if filename.lower().endswith(".css"):
					with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
						css = file.read()
						processed_css = css

						# To get popup footnotes in iBooks, we have to change epub:rearnote to epub:footnote.
						# Remember to get our custom style selectors too.
						processed_css = processed_css.replace("rearnote", "footnote")

						# Add new break-* aliases for compatibilty with newer readers.
						processed_css = regex.sub(r"(\s+)page-break-(.+?:\s.+?;)", "\\1page-break-\\2\t\\1break-\\2", processed_css)

						if processed_css != css:
							file.seek(0)
							file.write(processed_css)
							file.truncate()

		if build_kobo:
			with tempfile.TemporaryDirectory() as kobo_work_directory:
				copy_tree(work_epub_root_directory, kobo_work_directory)

				for root, _, filenames in os.walk(kobo_work_directory):
					# Add a note to content.opf indicating this is a transform build
					for filename in fnmatch.filter(filenames, "content.opf"):
						with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
							xhtml = file.read()

							xhtml = regex.sub(r"<dc:publisher", "<meta property=\"se:transform\">kobo</meta>\n\t\t<dc:publisher", xhtml)

							file.seek(0)
							file.write(xhtml)
							file.truncate()

					# Kobo .kepub files need each clause wrapped in a special <span> tag to enable highlighting.
					# Do this here. Hopefully Kobo will get their act together soon and drop this requirement.
					for filename in fnmatch.filter(filenames, "*.xhtml"):
						se.kobo.paragraph_counter = 1
						se.kobo.segment_counter = 1

						# Don't add spans to the ToC
						if filename == "toc.xhtml":
							continue

						with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
							xhtml = file.read()
							# Kobos don't have fonts that support the ↩ character in endnotes, so replace it with «
							if filename == "endnotes.xhtml":
								# Note that we replaced ↩ with \u21a9\ufe0e in an earlier iOS compatibility fix
								xhtml = regex.sub(r"epub:type=\"se:referrer\">\u21a9\ufe0e</a>", "epub:type=\"se:referrer\">«</a>", xhtml)

							# We have to remove the default namespace declaration from our document, otherwise
							# xpath won't find anything at all.  See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python
							try:
								tree = etree.fromstring(str.encode(xhtml.replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "")))
							except Exception as ex:
								raise se.InvalidXhtmlException("Error parsing XHTML file: {}\n{}".format(filename, ex), verbose)

							se.kobo.add_kobo_spans_to_node(tree.xpath("./body", namespaces=se.XHTML_NAMESPACES)[0])

							xhtml = etree.tostring(tree, encoding="unicode", pretty_print=True, with_tail=False)
							xhtml = regex.sub(r"<html:span", "<span", xhtml)
							xhtml = regex.sub(r"html:span>", "span>", xhtml)
							xhtml = regex.sub(r"<span xmlns:html=\"http://www.w3.org/1999/xhtml\"", "<span", xhtml)
							xhtml = regex.sub(r"<html", "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\"", xhtml)

							file.seek(0)
							file.write(xhtml)
							file.truncate()

				se.epub.write_epub(kobo_work_directory, os.path.join(output_directory, kobo_output_filename))

			if verbose:
				print(" OK")
				print("\tBuilding {} ...".format(epub_output_filename), end="", flush=True)

		# Now work on more epub2 compatibility

		# Recurse over css files to make some compatibility replacements.
		for root, _, filenames in os.walk(work_epub_root_directory):
			for filename in filenames:
				if filename.lower().endswith(".css"):
					with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
						css = file.read()
						processed_css = css

						processed_css = regex.sub(r"(page\-break\-(before|after|inside)\s*:\s*(.+))", "\\1\n\t-webkit-column-break-\\2: \\3 /* For Readium */", processed_css)
						processed_css = regex.sub(r"^\s*hyphens\s*:\s*(.+)", "\thyphens: \\1\n\tadobe-hyphenate: \\1\n\t-webkit-hyphens: \\1\n\t-epub-hyphens: \\1\n\t-moz-hyphens: \\1", processed_css, flags=regex.MULTILINE)
						processed_css = regex.sub(r"^\s*hyphens\s*:\s*none;", "\thyphens: none;\n\tadobe-text-layout: optimizeSpeed; /* For Nook */", processed_css, flags=regex.MULTILINE)

						if processed_css != css:
							file.seek(0)
							file.write(processed_css)
							file.truncate()

		# Sort out MathML compatibility
		has_mathml = "mathml" in metadata_xhtml
		if has_mathml:
			firefox_path = shutil.which("firefox")
			if firefox_path is None:
				raise se.MissingDependencyException("firefox is required to process MathML, but firefox couldn't be located. Is it installed?")

			mathml_count = 1
			for root, _, filenames in os.walk(work_epub_root_directory):
				for filename in filenames:
					if filename.lower().endswith(".xhtml"):
						with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
							xhtml = file.read()
							processed_xhtml = xhtml
							replaced_mathml = []

							# Check if there's MathML we want to convert
							# We take a naive approach and use some regexes to try to simplify simple MathML expressions.
							# For each MathML expression, if our round of regexes finishes and there is still MathML in the processed result, we abandon the attempt and render to PNG using Firefox.
							for line in regex.findall(r"<(?:m:)math[^>]*?>(?:.+?)</(?:m:)math>", processed_xhtml, flags=regex.DOTALL):
								if line not in replaced_mathml:
									replaced_mathml.append(line) # Store converted lines to save time in case we have multiple instances of the same MathML
									mathml_tree = se.easy_xml.EasyXmlTree("<?xml version=\"1.0\" encoding=\"utf-8\"?>{}".format(regex.sub(r"<(/?)m:", "<\\1", line)))
									processed_line = line

									# If the mfenced element has more than one child, they are separated by commas when rendered.
									# This is too complex for our naive regexes to work around. So, if there is an mfenced element with more than one child, abandon the attempt.
									if not mathml_tree.css_select("mfenced > * + *"):
										processed_line = regex.sub(r"</?(?:m:)?math[^>]*?>", "", processed_line)
										processed_line = regex.sub(r"<!--.+?-->", "", processed_line)
										processed_line = regex.sub(r"<(?:m:)?mfenced/>", "()", processed_line)
										processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mi)>(.+?)</\3><((?:m:)?mi)>(.+?)</\5></\1>", "<i>\\4</i><\\2><i>\\6</i></\\2>", processed_line)
										processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mi)>(.+?)</\3><((?:m:)?mn)>(.+?)</\5></\1>", "<i>\\4</i><\\2>\\6</\\2>", processed_line)
										processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mn)>(.+?)</\3><((?:m:)?mn)>(.+?)</\5></\1>", "\\4<\\2>\\6</\\2>", processed_line)
										processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mn)>(.+?)</\3><((?:m:)?mi)>(.+?)</\5></\1>", "\\4<\\2><i>\\6</i></\\2>", processed_line)
										processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mi) mathvariant=\"normal\">(.+?)</\3><((?:m:)?mi)>(.+?)</\5></\1>", "\\4<\\2><i>\\6</i></\\2>", processed_line)
										processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mi) mathvariant=\"normal\">(.+?)</\3><((?:m:)?mn)>(.+?)</\5></\1>", "\\4<\\2>\\6</\\2>", processed_line)
										processed_line = regex.sub(r"<(?:m:)?mo>{}</(?:m:)?mo>".format(se.FUNCTION_APPLICATION), "", processed_line, flags=regex.IGNORECASE) # The ignore case flag is required to match here with the special FUNCTION_APPLICATION character, it's unclear why
										processed_line = regex.sub(r"<(?:m:)?mfenced><((?:m:)(?:mo|mi|mn|mrow))>(.+?)</\1></(?:m:)?mfenced>", "(<\\1>\\2</\\1>)", processed_line)
										processed_line = regex.sub(r"<(?:m:)?mrow>([^>].+?)</(?:m:)?mrow>", "\\1", processed_line)
										processed_line = regex.sub(r"<(?:m:)?mi>([^<]+?)</(?:m:)?mi>", "<i>\\1</i>", processed_line)
										processed_line = regex.sub(r"<(?:m:)?mi mathvariant=\"normal\">([^<]+?)</(?:m:)?mi>", "\\1", processed_line)
										processed_line = regex.sub(r"<(?:m:)?mo>([+\-−=×])</(?:m:)?mo>", " \\1 ", processed_line)
										processed_line = regex.sub(r"<((?:m:)?m[no])>(.+?)</\1>", "\\2", processed_line)
										processed_line = regex.sub(r"</?(?:m:)?mrow>", "", processed_line)
										processed_line = processed_line.strip()
										processed_line = regex.sub(r"</i><i>", "", processed_line, flags=regex.DOTALL)

									# Did we succeed? Is there any more MathML in our string?
									if regex.findall("</?(?:m:)?m", processed_line):
										# Failure! Abandon all hope, and use Firefox to convert the MathML to PNG.
										se.images.render_mathml_to_png(regex.sub(r"<(/?)m:", "<\\1", line), os.path.join(work_epub_root_directory, "epub", "images", "mathml-{}.png".format(mathml_count)))

										processed_xhtml = processed_xhtml.replace(line, "<img class=\"mathml epub-type-se-image-color-depth-black-on-transparent\" epub:type=\"se:image.color-depth.black-on-transparent\" src=\"../images/mathml-{}.png\" />".format(mathml_count))
										mathml_count = mathml_count + 1
									else:
										# Success! Replace the MathML with our new string.
										processed_xhtml = processed_xhtml.replace(line, processed_line)

							if processed_xhtml != xhtml:
								file.seek(0)
								file.write(processed_xhtml)
								file.truncate()

		# Include epub2 cover metadata
		cover_id = metadata_tree.xpath("//opf:item[@properties=\"cover-image\"]/@id")[0].replace(".svg", ".jpg")
		metadata_xhtml = regex.sub(r"(<metadata[^>]+?>)", "\\1\n\t\t<meta content=\"{}\" name=\"cover\" />".format(cover_id), metadata_xhtml)

		# Add metadata to content.opf indicating this file is a Standard Ebooks compatibility build
		metadata_xhtml = metadata_xhtml.replace("<dc:publisher", "<meta property=\"se:transform\">compatibility</meta>\n\t\t<dc:publisher")

		# Add any new MathML images we generated to the manifest
		if has_mathml:
			for root, _, filenames in os.walk(os.path.join(work_epub_root_directory, "epub", "images")):
				filenames = se.natural_sort(filenames)
				filenames.reverse()
				for filename in filenames:
					if filename.lower().startswith("mathml-"):
						metadata_xhtml = metadata_xhtml.replace("<manifest>", "<manifest><item href=\"images/{}\" id=\"{}\" media-type=\"image/png\"/>".format(filename, filename))

			metadata_xhtml = regex.sub(r"properties=\"([^\"]*?)mathml([^\"]*?)\"", "properties=\"\\1\\2\"", metadata_xhtml)

		metadata_xhtml = regex.sub(r"properties=\"\s*\"", "", metadata_xhtml)

		# Generate our NCX file for epub2 compatibility.
		# First find the ToC file.
		toc_filename = metadata_tree.xpath("//opf:item[@properties=\"nav\"]/@href")[0]
		metadata_xhtml = metadata_xhtml.replace("<spine>", "<spine toc=\"ncx\">")
		metadata_xhtml = metadata_xhtml.replace("<manifest>", "<manifest><item href=\"toc.ncx\" id=\"ncx\" media-type=\"application/x-dtbncx+xml\" />")

		# Now use an XSLT transform to generate the NCX
		toc_tree = se.epub.convert_toc_to_ncx(work_epub_root_directory, toc_filename, navdoc2ncx_xsl_filename)

		# Convert the <nav> landmarks element to the <guide> element in content.opf
		guide_xhtml = "<guide>"
		for element in toc_tree.xpath("//xhtml:nav[@epub:type=\"landmarks\"]/xhtml:ol/xhtml:li/xhtml:a"):
			element_xhtml = element.tostring()
			element_xhtml = regex.sub(r"epub:type=\"([^\"]*)(\s*frontmatter\s*|\s*backmatter\s*)([^\"]*)\"", "type=\"\\1\\3\"", element_xhtml)
			element_xhtml = regex.sub(r"epub:type=\"[^\"]*(acknowledgements|bibliography|colophon|copyright-page|cover|dedication|epigraph|foreword|glossary|index|loi|lot|notes|preface|bodymatter|titlepage|toc)[^\"]*\"", "type=\"\\1\"", element_xhtml)
			element_xhtml = element_xhtml.replace("type=\"copyright-page", "type=\"copyright page")

			# We add the 'text' attribute to the titlepage to tell the reader to start there
			element_xhtml = element_xhtml.replace("type=\"titlepage", "type=\"title-page text")

			element_xhtml = regex.sub(r"type=\"\s*\"", "", element_xhtml)
			element_xhtml = element_xhtml.replace("<a", "<reference")
			element_xhtml = regex.sub(r">(.+)</a>", " title=\"\\1\" />", element_xhtml)

			# Replace instances of the `role` attribute since it's illegal in content.opf
			element_xhtml = regex.sub(r" role=\".*?\"", "", element_xhtml)

			guide_xhtml = guide_xhtml + element_xhtml

		guide_xhtml = guide_xhtml + "</guide>"

		metadata_xhtml = metadata_xhtml.replace("</package>", "") + guide_xhtml + "</package>"

		# Guide is done, now write content.opf and clean it.
		# Output the modified content.opf before making more epub2 compatibility hacks.
		with open(os.path.join(work_epub_root_directory, "epub", "content.opf"), "w", encoding="utf-8") as file:
			file.write(metadata_xhtml)
			file.truncate()

		# All done, clean the output
		for filename in se.get_target_filenames([work_epub_root_directory], (".xhtml", ".svg", ".opf", ".ncx")):
			se.formatting.format_xhtml_file(filename, False, filename.endswith("content.opf"), filename.endswith("endnotes.xhtml"))

		# Write the compatible epub
		se.epub.write_epub(work_epub_root_directory, os.path.join(output_directory, epub_output_filename))

		if verbose:
			print(" OK")

		if run_epubcheck:
			if verbose:
				print("\tRunning epubcheck on {} ...".format(epub_output_filename), end="", flush=True)

			output = subprocess.run([epubcheck_path, "--quiet", os.path.join(output_directory, epub_output_filename)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout.decode().strip()

			# epubcheck on Ubuntu 18.04 outputs some seemingly harmless warnings; flush them here.
			if output:
				output = regex.sub(r"\s*Warning at char 3 in xsl:param/@select on line.+", "", output)
				output = regex.sub(r"\s*SXWN9000: The parent axis starting at a document node will never select anything", "", output)

			if output:
				if verbose:
					print("\n\t\t" + "\t\t".join(output.splitlines(True)), file=sys.stderr)
				else:
					print(output, file=sys.stderr)
				return

			if verbose:
				print(" OK")


		if build_kindle:
			if verbose:
				print("\tBuilding {} ...".format(kindle_output_filename), end="", flush=True)

			# Kindle doesn't go more than 2 levels deep for ToC, so flatten it here.
			with open(os.path.join(work_epub_root_directory, "epub", toc_filename), "r+", encoding="utf-8") as file:
				xhtml = file.read()

				soup = BeautifulSoup(xhtml, "lxml")

				for match in soup.select("ol > li > ol > li > ol"):
					match.unwrap()

				xhtml = str(soup)

				pattern = regex.compile(r"(<li>\s*<a href=\"[^\"]+?\">.+?</a>\s*)<li>")
				matches = 1
				while matches > 0:
					xhtml, matches = pattern.subn(r"\1</li><li>", xhtml)

				pattern = regex.compile(r"</li>\s*</li>")
				matches = 1
				while matches > 0:
					xhtml, matches = pattern.subn("</li>", xhtml)

				file.seek(0)
				file.write(xhtml)
				file.truncate()

			# Rebuild the NCX
			toc_tree = se.epub.convert_toc_to_ncx(work_epub_root_directory, toc_filename, navdoc2ncx_xsl_filename)

			# Clean just the ToC and NCX
			for filename in [os.path.join(work_epub_root_directory, "epub", "toc.ncx"), os.path.join(work_epub_root_directory, "epub", toc_filename)]:
				se.formatting.format_xhtml_file(filename, False)

			# Convert endnotes to Kindle popup compatible notes
			if os.path.isfile(os.path.join(work_epub_root_directory, "epub", "text", "endnotes.xhtml")):
				with open(os.path.join(work_epub_root_directory, "epub", "text", "endnotes.xhtml"), "r+", encoding="utf-8") as file:
					xhtml = file.read()

					# We have to remove the default namespace declaration from our document, otherwise
					# xpath won't find anything at all.  See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python
					try:
						tree = etree.fromstring(str.encode(xhtml.replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "")))
					except Exception as ex:
						raise se.InvalidXhtmlException("Error parsing XHTML file: endnotes.xhtml\n{}".format(ex))

					notes = tree.xpath("//li[@epub:type=\"rearnote\" or @epub:type=\"footnote\"]", namespaces=se.XHTML_NAMESPACES)

					processed_endnotes = ""

					for note in notes:
						note_id = note.get("id")
						note_number = note_id.replace("note-", "")

						# First, fixup the reference link for this endnote
						try:
							ref_link = etree.tostring(note.xpath("p[last()]/a[last()]")[0], encoding="unicode", pretty_print=True, with_tail=False).replace(" xmlns:epub=\"http://www.idpf.org/2007/ops\"", "").strip()
						except Exception:
							raise se.InvalidXhtmlException("Can’t find ref link for #{}.".format(note_id))

						new_ref_link = regex.sub(r">.*?</a>", ">" + note_number + "</a>.", ref_link)

						# Now remove the wrapping li node from the note
						note_text = regex.sub(r"^<li[^>]*?>(.*)</li>$", r"\1", etree.tostring(note, encoding="unicode", pretty_print=True, with_tail=False), flags=regex.IGNORECASE | regex.DOTALL)

						# Insert our new ref link
						result = regex.subn(r"^\s*<p([^>]*?)>", "<p\\1 id=\"" + note_id + "\">" + new_ref_link + " ", note_text)

						# Sometimes there is no leading <p> tag (for example, if the endnote starts with a blockquote
						# If that's the case, just insert one in front.
						note_text = result[0]
						if result[1] == 0:
							note_text = "<p id=\"" + note_id + "\">" + new_ref_link + "</p>" + note_text

						# Now remove the old ref_link
						note_text = note_text.replace(ref_link, "")

						# Trim trailing spaces left over after removing the ref link
						note_text = regex.sub(r"\s+</p>", "</p>", note_text).strip()

						# Sometimes ref links are in their own p tag--remove that too
						note_text = regex.sub(r"<p>\s*</p>", "", note_text)

						processed_endnotes += note_text + "\n"

					# All done with endnotes, so drop them back in
					xhtml = regex.sub(r"<ol>.*</ol>", processed_endnotes, xhtml, flags=regex.IGNORECASE | regex.DOTALL)

					file.seek(0)
					file.write(xhtml)
					file.truncate()

				# While Kindle now supports soft hyphens, popup endnotes break words but don't insert the hyphen characters.  So for now, remove soft hyphens from the endnotes file.
				with open(os.path.join(work_epub_root_directory, "epub", "text", "endnotes.xhtml"), "r+", encoding="utf-8") as file:
					xhtml = file.read()
					processed_xhtml = xhtml

					processed_xhtml = processed_xhtml.replace(se.SHY_HYPHEN, "")

					if processed_xhtml != xhtml:
						file.seek(0)
						file.write(processed_xhtml)
						file.truncate()

			# Do some compatibility replacements
			for root, _, filenames in os.walk(work_epub_root_directory):
				for filename in filenames:
					if filename.lower().endswith(".xhtml"):
						with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
							xhtml = file.read()
							processed_xhtml = xhtml

							# Kindle doesn't recognize most zero-width spaces or word joiners, so just remove them.
							# It does recognize the word joiner character, but only in the old mobi7 format.  The new format renders them as spaces.
							processed_xhtml = processed_xhtml.replace(se.ZERO_WIDTH_SPACE, "")

							# Remove the epub:type attribute, as Calibre turns it into just "type"
							processed_xhtml = regex.sub(r"epub:type=\"[^\"]*?\"", "", processed_xhtml)

							if processed_xhtml != xhtml:
								file.seek(0)
								file.write(processed_xhtml)
								file.truncate()

			# Include compatibility CSS
			with open(os.path.join(work_epub_root_directory, "epub", "css", "core.css"), "a", encoding="utf-8") as core_css_file:
				with open(resource_filename("se", os.path.join("data", "templates", "kindle.css")), "r", encoding="utf-8") as compatibility_css_file:
					core_css_file.write(compatibility_css_file.read())

			# Add soft hyphens
			for filename in se.get_target_filenames([work_epub_root_directory], (".xhtml")):
				se.typography.hyphenate_file(filename, None, True)

			# Build an epub file we can send to Calibre
			se.epub.write_epub(work_epub_root_directory, os.path.join(work_directory, epub_output_filename))

			# Generate the Kindle file
			# We place it in the work directory because later we have to update the asin, and the se.mobi.update_asin() function will write to the final output directory
			cover_path = os.path.join(work_epub_root_directory, "epub", metadata_tree.xpath("//opf:item[@properties=\"cover-image\"]/@href")[0].replace(".svg", ".jpg"))
			return_code = subprocess.run([ebook_convert_path, os.path.join(work_directory, epub_output_filename), os.path.join(work_directory, kindle_output_filename), "--pretty-print", "--no-inline-toc", "--max-toc-links=0", "--prefer-metadata-cover", "--cover={}".format(cover_path)], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).returncode

			if return_code:
				raise se.InvalidSeEbookException("ebook-convert failed.")
			else:
				# Success, extract the Kindle cover thumbnail

				# Update the ASIN in the generated file
				se.mobi.update_asin(asin, os.path.join(work_directory, kindle_output_filename), os.path.join(output_directory, kindle_output_filename))

				# Extract the thumbnail
				subprocess.run([convert_path, os.path.join(work_epub_root_directory, "epub", "images", "cover.jpg"), "-resize", "432x660", os.path.join(output_directory, "thumbnail_{}_EBOK_portrait.jpg".format(asin))], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

			if verbose:
				print(" OK")
示例#14
0
def process_headings(dom: EasyXmlTree, textf: str, toc_list: list,
                     nest_under_halftitle: bool, single_file: bool):
    """
	Find headings in current file and extract title data
	into items added to toc_list.

	INPUTS:
	dom: an EasyXmlTree representation of the current file
	textf: the path to the file
	toc_list: the list of ToC items we are building
	nest_under_halftitle: does this item need to be nested?
	single_file: is there only a single content item in the production?

	OUTPUTS:
	None
	"""

    body = dom.xpath("//body")
    if not body:
        raise se.InvalidInputException("Couldn't locate body node")
    place = get_place(body[0])

    is_toplevel = True

    # Find all the hgroups and h1, h2 etc headings.
    heads = dom.xpath("//hgroup | //h1 | //h2 | //h3 | //h4 | //h5 | //h6")

    # Special treatment where we can't find any headers or hgroups.
    if not heads:  # May be a dedication or an epigraph, with no heading tag.
        if single_file and nest_under_halftitle:
            # There's a halftitle, but only this one content file with no subsections,
            # so leave out of ToC because the ToC will link to the halftitle.
            return
        special_item = TocItem()
        # Need to determine level depth.
        # We don't have a heading, so get the first content item.
        # BUG FIX: xpath() returns a list, so the old `is not None` check was
        # always true and an empty result crashed with IndexError at
        # content_item[0]; test for a non-empty list instead.
        content_item = dom.xpath("//p | //header | //img")
        if content_item:
            parents = content_item[0].xpath(
                "./ancestor::*[name() = 'section' or name() = 'article']")
            special_item.level = len(parents)
            if special_item.level == 0:
                special_item.level = 1
        if nest_under_halftitle:
            special_item.level += 1
        # Use the page title as the ToC entry title.
        special_item.title = dom.xpath("//head/title/text()", True)
        if special_item.title is None:
            special_item.title = "NO TITLE"
        special_item.file_link = textf
        toc_list.append(special_item)
        return

    for heading in heads:
        # Don't process a heading separately if it's within an hgroup.
        if heading.parent.tag == "hgroup":
            continue  # skip it

        if place == Position.BODY:
            toc_item = process_a_heading(heading, textf, is_toplevel,
                                         single_file)
        else:
            # If it's not a bodymatter item we don't care about whether it's single_file.
            toc_item = process_a_heading(heading, textf, is_toplevel, False)

        # Tricky check to see if we want to include the item because there's a halftitle
        # but only a single content file with no subsidiary sections.
        if is_toplevel and single_file and nest_under_halftitle and len(
                heads) == 1:
            continue
        if nest_under_halftitle:
            toc_item.level += 1
        is_toplevel = False
        toc_list.append(toc_item)
示例#15
0
    def generate_endnotes(self) -> str:
        """
		The generate_endnotes() function is very big so for readability and maintainability
		it's broken out to a separate file. Strictly speaking that file can be inlined
		into this class.
		"""

        processed = 0
        report = ""
        current_note_number = 1
        notes_changed = 0
        change_list = []

        for file_name in self.get_content_files():
            if file_name in [
                    "titlepage.xhtml", "colophon.xhtml", "uncopyright.xhtml",
                    "imprint.xhtml", "halftitle.xhtml", "endnotes.xhtml"
            ]:
                continue

            processed += 1

            file_path = self.path / "src" / "epub" / "text" / file_name
            try:
                with open(file_path) as file:
                    soup = BeautifulSoup(file.read(), "lxml")
            except:
                raise se.InvalidFileException("Could't open file: {}".format(
                    str(file_path)))

            links = soup.find_all("a")
            needs_rewrite = False
            for link in links:
                epub_type = link.get("epub:type") or ""
                if epub_type == "noteref":
                    old_anchor = ""
                    href = link.get("href") or ""
                    if href:
                        # Extract just the anchor from a URL (ie, what follows a hash symbol)
                        old_anchor = ""

                        hash_position = href.find(
                            "#") + 1  # we want the characters AFTER the hash
                        if hash_position > 0:
                            old_anchor = href[hash_position:]

                    new_anchor = "note-{:d}".format(current_note_number)
                    if new_anchor != old_anchor:
                        change_list.append("Changed " + old_anchor + " to " +
                                           new_anchor + " in " + file_name)
                        notes_changed += 1
                        # Update the link in the soup object
                        link["href"] = 'endnotes.xhtml#' + new_anchor
                        link["id"] = 'noteref-{:d}'.format(current_note_number)
                        link.string = str(current_note_number)
                        needs_rewrite = True
                    # Now try to find this in endnotes
                    matches = list(
                        filter(lambda x, old=old_anchor: x.anchor == old,
                               self.endnotes))
                    if not matches:
                        raise se.InvalidInputException(
                            "Couldn't find endnote with anchor " + old_anchor)
                    if len(matches) > 1:
                        raise se.InvalidInputException(
                            "Duplicate anchors in endnotes file for anchor " +
                            old_anchor)
                    # Found a single match, which is what we want
                    endnote = matches[0]
                    endnote.number = current_note_number
                    endnote.matched = True
                    # We don't change the anchor or the back ref just yet
                    endnote.source_file = file_name
                    current_note_number += 1

            # If we need to write back the body text file
            if needs_rewrite:
                new_file = open(file_path, "w")
                new_file.write(se.formatting.format_xhtml(str(soup)))
                new_file.close()

        if processed == 0:
            report += "No files processed. Did you update the manifest and order the spine?" + "\n"
        else:
            report += "Found {:d} endnotes.".format(current_note_number -
                                                    1) + "\n"
            if notes_changed > 0:
                # Now we need to recreate the endnotes file
                ol_tag = self._endnotes_soup.ol
                ol_tag.clear()
                for endnote in self.endnotes:
                    if endnote.matched:
                        li_tag = self._endnotes_soup.new_tag("li")
                        li_tag["id"] = "note-" + str(endnote.number)
                        li_tag["epub:type"] = "endnote"
                        for content in endnote.contents:
                            if isinstance(content, Tag):
                                links = content.find_all("a")
                                for link in links:
                                    epub_type = link.get("epub:type") or ""
                                    if epub_type == "se:referrer":
                                        href = link.get("href") or ""
                                        if href:
                                            link[
                                                "href"] = endnote.source_file + "#noteref-" + str(
                                                    endnote.number)
                            li_tag.append(content)
                        ol_tag.append(li_tag)

                with open(
                        self.path / "src" / "epub" / "text" / "endnotes.xhtml",
                        "w") as file:
                    file.write(
                        se.formatting.format_xhtml(str(self._endnotes_soup),
                                                   is_endnotes_file=True))

                report += "Changed {:d} endnote{}.".format(
                    notes_changed, "s" if notes_changed != 1 else "")
            else:
                report += "No changes made."
        return report
示例#16
0
	def generate_endnotes(self) -> Tuple[int, int]:
		"""
		Read the epub spine to regenerate all endnotes in order of appearance, starting from 1.
		Changes are written to disk.

		Returns a tuple of (found_endnote_count, changed_endnote_count)

		Raises se.InvalidFileException if a content file can't be opened, and
		se.InvalidInputException if nothing was processed or an anchor is
		missing or duplicated in the endnotes file.
		"""

		processed = 0
		current_note_number = 1
		notes_changed = 0

		for file_name in self.get_content_files():
			# These files never contain noterefs, so skip them.
			if file_name in ["titlepage.xhtml", "colophon.xhtml", "uncopyright.xhtml", "imprint.xhtml", "halftitle.xhtml", "endnotes.xhtml"]:
				continue

			processed += 1

			file_path = self.path / "src/epub/text" / file_name
			try:
				# Read explicitly as UTF-8 so behavior doesn't depend on the platform default.
				with open(file_path, encoding="utf-8") as file:
					soup = BeautifulSoup(file.read(), "lxml")
			except Exception as ex:
				# Was a bare `except:`; narrow it and chain the cause so the underlying error isn't lost.
				raise se.InvalidFileException(f"Couldn’t open file: [path][link=file://{file_path}]{file_path}[/][/].") from ex

			links = soup.find_all("a")
			needs_rewrite = False
			for link in links:
				epub_type = link.get("epub:type") or ""
				if epub_type == "noteref":
					# Extract just the anchor from the URL (ie, what follows a hash symbol)
					old_anchor = ""
					href = link.get("href") or ""
					if href:
						hash_position = href.find("#") + 1  # we want the characters AFTER the hash
						if hash_position > 0:
							old_anchor = href[hash_position:]

					new_anchor = f"note-{current_note_number:d}"
					if new_anchor != old_anchor:
						notes_changed += 1
						# Update the link in the soup object
						link["href"] = 'endnotes.xhtml#' + new_anchor
						link["id"] = f'noteref-{current_note_number:d}'
						link.string = str(current_note_number)
						needs_rewrite = True
					# Now try to find this in endnotes (a comprehension is clearer than
					# the old lambda-assigned-to-a-name + filter())
					matches = [note for note in self.endnotes if note.anchor == old_anchor]
					if not matches:
						raise se.InvalidInputException(f"Couldn’t find endnote with anchor [attr]{old_anchor}[/].")
					if len(matches) > 1:
						raise se.InvalidInputException(f"Duplicate anchors in endnotes file for anchor [attr]{old_anchor}[/].")
					# Found a single match, which is what we want
					endnote = matches[0]
					endnote.number = current_note_number
					endnote.matched = True
					# We don't change the anchor or the back ref just yet
					endnote.source_file = file_name
					current_note_number += 1

			# If we need to write back the body text file.
			# Use a context manager so the file is closed even if formatting raises.
			if needs_rewrite:
				with open(file_path, "w", encoding="utf-8") as new_file:
					new_file.write(se.formatting.format_xhtml(str(soup)))

		if processed == 0:
			raise se.InvalidInputException("No files processed. Did you update the manifest and order the spine?")

		if notes_changed > 0:
			# Now we need to recreate the endnotes file
			ol_tag = self._endnotes_soup.ol
			ol_tag.clear()

			self.endnotes.sort(key=lambda endnote: endnote.number)

			for endnote in self.endnotes:
				if endnote.matched:
					li_tag = self._endnotes_soup.new_tag("li")
					li_tag["id"] = "note-" + str(endnote.number)
					li_tag["epub:type"] = "endnote"
					for content in endnote.contents:
						if isinstance(content, Tag):
							# Point each backlink at the renumbered noteref.
							for link in content.find_all("a"):
								epub_type = link.get("epub:type") or ""
								if epub_type == "backlink":
									href = link.get("href") or ""
									if href:
										link["href"] = endnote.source_file + "#noteref-" + str(endnote.number)
						li_tag.append(content)
					ol_tag.append(li_tag)

			with open(self.path / "src" / "epub" / "text" / "endnotes.xhtml", "w", encoding="utf-8") as file:
				file.write(se.formatting.format_xhtml(str(self._endnotes_soup)))

		return (current_note_number - 1, notes_changed)
示例#17
0
def create_draft(args: list):
    """
	Entry point for `se create-draft`

	INPUTS:
	args: parsed command-line arguments; this function reads args.author,
	args.title, args.translator, args.illustrator, args.pg_url, args.email,
	args.create_se_repo, and args.create_github_repo

	OUTPUTS:
	None; creates the ebook skeleton directory on disk, initializes a git
	repository in it, and optionally creates the remote SE/GitHub repos

	Raises se.InvalidInputException, se.RemoteCommandErrorException,
	se.InvalidEncodingException, or se.InvalidFileException on failure.
	"""

    # Put together some variables for later use
    identifier = se.formatting.make_url_safe(
        args.author) + "/" + se.formatting.make_url_safe(args.title)
    title_string = args.title.replace(
        "'", "’") + ", by " + args.author.replace("'", "’")
    # Move a leading article to the end, e.g. "The Odyssey" -> "Odyssey, The"
    sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", args.title)
    pg_producers = []

    if args.translator:
        identifier = identifier + "/" + se.formatting.make_url_safe(
            args.translator)
        title_string = title_string + ". Translated by " + args.translator

    if args.illustrator:
        identifier = identifier + "/" + se.formatting.make_url_safe(
            args.illustrator)
        title_string = title_string + ". Illustrated by " + args.illustrator

    repo_name = Path(identifier.replace("/", "_"))

    if repo_name.is_dir():
        raise se.InvalidInputException(
            "./{}/ already exists.".format(repo_name))

    # Download PG HTML and do some fixups
    if args.pg_url:
        args.pg_url = args.pg_url.replace("http://", "https://")

        # Get the ebook metadata
        try:
            response = requests.get(args.pg_url)
            # Fail early on HTTP errors instead of parsing an error page as metadata
            response.raise_for_status()
            pg_metadata_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                "Couldn’t download Project Gutenberg ebook metadata page. Error: {}"
                .format(ex))

        soup = BeautifulSoup(pg_metadata_html, "lxml")

        # Get the ebook HTML URL from the metadata; protocol-relative and
        # site-relative hrefs are normalized to absolute https URLs
        pg_ebook_url = None
        for element in soup.select("a[type^=\"text/html\"]"):
            pg_ebook_url = regex.sub(r"^//", "https://", element["href"])
            pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/",
                                     pg_ebook_url)

        if not pg_ebook_url:
            raise se.RemoteCommandErrorException(
                "Could download ebook metadata, but couldn’t find URL for the ebook HTML."
            )

        # Get the ebook LCSH categories
        pg_subjects = []
        for element in soup.select("td[property=\"dcterms:subject\"]"):
            if element["datatype"] == "dcterms:LCSH":
                # NOTE(review): find("a") returns a single tag; iterating it yields
                # its children, so this relies on each <a> holding one text node.
                # find_all("a") may be the real intent — confirm before changing.
                for subject_link in element.find("a"):
                    pg_subjects.append(subject_link.strip())

        # Get the PG publication date (extract the 4-digit year)
        pg_publication_year = None
        for element in soup.select("td[itemprop=\"datePublished\"]"):
            pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1",
                                            element.text)

        # Get the actual ebook URL
        try:
            response = requests.get(pg_ebook_url)
            # Fail early on HTTP errors instead of treating an error page as the ebook
            response.raise_for_status()
            pg_ebook_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                "Couldn’t download Project Gutenberg ebook HTML. Error: {}".
                format(ex))

        try:
            fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
            pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
        except Exception as ex:
            raise se.InvalidEncodingException(
                "Couldn’t determine text encoding of Project Gutenberg HTML file. Error: {}"
                .format(ex))

        # Try to guess the ebook language from British spellings
        pg_language = "en-US"
        if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
            pg_language = "en-GB"

    # Create necessary directories
    (repo_name / "images").mkdir(parents=True)
    (repo_name / "src" / "epub" / "css").mkdir(parents=True)
    (repo_name / "src" / "epub" / "images").mkdir(parents=True)
    (repo_name / "src" / "epub" / "text").mkdir(parents=True)
    (repo_name / "src" / "META-INF").mkdir(parents=True)

    # Write PG data if we have it
    if args.pg_url and pg_ebook_html:
        try:
            soup = BeautifulSoup(pg_ebook_html, "html.parser")

            # Try to get the PG producers.  We only try this if there's a <pre> block with the header info (which is not always the case)
            for element in soup(text=regex.compile(r"\*\*\*\s*Produced by.+$",
                                                   flags=regex.DOTALL)):
                if element.parent.name == "pre":
                    pg_producers = regex.sub(r".+?Produced by (.+?)\s*$",
                                             "\\1",
                                             element,
                                             flags=regex.DOTALL)
                    pg_producers = regex.sub(r"\(.+?\)",
                                             "",
                                             pg_producers,
                                             flags=regex.DOTALL)
                    pg_producers = regex.sub(r"(at )?https?://www\.pgdp\.net",
                                             "",
                                             pg_producers,
                                             flags=regex.DOTALL)
                    pg_producers = regex.sub(r"[\r\n]+",
                                             " ",
                                             pg_producers,
                                             flags=regex.DOTALL)
                    pg_producers = regex.sub(r",? and ", ", and ",
                                             pg_producers)
                    pg_producers = pg_producers.replace(
                        " and the Online", " and The Online")
                    pg_producers = pg_producers.replace(
                        ", and ", ", ").strip().split(", ")

            # Try to strip out the PG header
            for element in soup(text=regex.compile(r"\*\*\*\s*START OF THIS")):
                for sibling in element.parent.find_previous_siblings():
                    sibling.decompose()

                element.parent.decompose()

            # Try to strip out the PG license footer
            for element in soup(
                    text=regex.compile(r"End of (the )?Project Gutenberg")):
                for sibling in element.parent.find_next_siblings():
                    sibling.decompose()

                element.parent.decompose()

            with open(repo_name / "src" / "epub" / "text" / "body.xhtml",
                      "w",
                      encoding="utf-8") as file:
                file.write(str(soup))
        except IOError as ex:
            raise se.InvalidFileException(
                "Couldn’t write to ebook directory. Error: {}".format(ex))
        except Exception:
            # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt propagate
            raise se.InvalidInputException(
                "Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook."
            )

    # Copy over templates
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "gitignore")),
        repo_name / ".gitignore")
    shutil.copy(
        resource_filename("se",
                          str(Path("data") / "templates" / "LICENSE.md")),
        repo_name)
    shutil.copy(
        resource_filename(
            "se",
            str(Path("data") / "templates" / "META-INF" / "container.xml")),
        repo_name / "src" / "META-INF")
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "mimetype")),
        repo_name / "src")
    shutil.copy(
        resource_filename("se",
                          str(Path("data") / "templates" / "content.opf")),
        repo_name / "src" / "epub")
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "onix.xml")),
        repo_name / "src" / "epub")
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "toc.xhtml")),
        repo_name / "src" / "epub")
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "core.css")),
        repo_name / "src" / "epub" / "css")
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "local.css")),
        repo_name / "src" / "epub" / "css")
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "logo.svg")),
        repo_name / "src" / "epub" / "images")
    shutil.copy(
        resource_filename("se",
                          str(Path("data") / "templates" / "colophon.xhtml")),
        repo_name / "src" / "epub" / "text")
    shutil.copy(
        resource_filename("se",
                          str(Path("data") / "templates" / "imprint.xhtml")),
        repo_name / "src" / "epub" / "text")
    shutil.copy(
        resource_filename("se",
                          str(Path("data") / "templates" / "titlepage.xhtml")),
        repo_name / "src" / "epub" / "text")
    shutil.copy(
        resource_filename(
            "se", str(Path("data") / "templates" / "uncopyright.xhtml")),
        repo_name / "src" / "epub" / "text")
    shutil.copy(
        resource_filename("se",
                          str(Path("data") / "templates" / "titlepage.svg")),
        repo_name / "images")
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "cover.jpg")),
        repo_name / "images" / "cover.jpg")
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "cover.svg")),
        repo_name / "images" / "cover.svg")

    # Try to find Wikipedia links if possible
    author_wiki_url, author_nacoaf_url = _get_wikipedia_url(args.author, True)
    ebook_wiki_url, _ = _get_wikipedia_url(args.title, False)
    translator_wiki_url = None
    if args.translator:
        translator_wiki_url, translator_nacoaf_url = _get_wikipedia_url(
            args.translator, True)

    # Pre-fill a few templates
    se.replace_in_file(repo_name / "src" / "epub" / "text" / "titlepage.xhtml",
                       "TITLE_STRING", title_string)
    se.replace_in_file(repo_name / "images" / "titlepage.svg", "TITLE_STRING",
                       title_string)
    se.replace_in_file(repo_name / "images" / "cover.svg", "TITLE_STRING",
                       title_string)

    # Create the titlepage SVG
    contributors = {}
    if args.translator:
        contributors["translated by"] = args.translator

    if args.illustrator:
        contributors["illustrated by"] = args.illustrator

    with open(repo_name / "images" / "titlepage.svg", "w",
              encoding="utf-8") as file:
        file.write(
            _generate_titlepage_svg(args.title, args.author, contributors,
                                    title_string))

    # Create the cover SVG
    with open(repo_name / "images" / "cover.svg", "w",
              encoding="utf-8") as file:
        file.write(_generate_cover_svg(args.title, args.author, title_string))

    if args.pg_url:
        se.replace_in_file(
            repo_name / "src" / "epub" / "text" / "imprint.xhtml", "PG_URL",
            args.pg_url)

    # Fill in the colophon template placeholders in place
    with open(repo_name / "src" / "epub" / "text" / "colophon.xhtml",
              "r+",
              encoding="utf-8") as file:
        colophon_xhtml = file.read()

        colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
        colophon_xhtml = colophon_xhtml.replace(">AUTHOR<",
                                                ">{}<".format(args.author))
        colophon_xhtml = colophon_xhtml.replace("TITLE", args.title)

        if author_wiki_url:
            colophon_xhtml = colophon_xhtml.replace("AUTHOR_WIKI_URL",
                                                    author_wiki_url)

        if args.pg_url:
            colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

            if pg_publication_year:
                colophon_xhtml = colophon_xhtml.replace(
                    "PG_YEAR", pg_publication_year)

            if pg_producers:
                # Build an Oxford-comma-separated transcriber credit line
                producers_xhtml = ""
                for i, producer in enumerate(pg_producers):
                    if "Distributed Proofreading" in producer:
                        producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
                    else:
                        producers_xhtml = producers_xhtml + "<b class=\"name\">{}</b>".format(
                            producer)

                    if i < len(pg_producers) - 1:
                        producers_xhtml = producers_xhtml + ", "

                    if i == len(pg_producers) - 2:
                        producers_xhtml = producers_xhtml + "and "

                producers_xhtml = producers_xhtml + "<br/>"

                colophon_xhtml = colophon_xhtml.replace(
                    "<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>",
                    producers_xhtml)

        file.seek(0)
        file.write(colophon_xhtml)
        file.truncate()

    # Fill in the content.opf metadata template placeholders in place
    with open(repo_name / "src" / "epub" / "content.opf",
              "r+",
              encoding="utf-8") as file:
        metadata_xhtml = file.read()

        metadata_xhtml = metadata_xhtml.replace("SE_IDENTIFIER", identifier)
        metadata_xhtml = metadata_xhtml.replace(">AUTHOR<",
                                                ">{}<".format(args.author))
        metadata_xhtml = metadata_xhtml.replace(">TITLE_SORT<",
                                                ">{}<".format(sorted_title))
        metadata_xhtml = metadata_xhtml.replace(">TITLE<",
                                                ">{}<".format(args.title))
        metadata_xhtml = metadata_xhtml.replace("VCS_IDENTIFIER",
                                                str(repo_name))

        if pg_producers:
            producers_xhtml = ""
            i = 1
            for producer in pg_producers:
                producers_xhtml = producers_xhtml + "\t\t<dc:contributor id=\"transcriber-{}\">{}</dc:contributor>\n".format(
                    i, producer)

                if "Distributed Proofreading" in producer:
                    producers_xhtml = producers_xhtml + "\t\t<meta property=\"file-as\" refines=\"#transcriber-{0}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{0}\">https://pgdp.net</meta>\n".format(
                        i)
                else:
                    producers_xhtml = producers_xhtml + "\t\t<meta property=\"file-as\" refines=\"#transcriber-{}\">TRANSCRIBER_SORT</meta>\n".format(
                        i)

                producers_xhtml = producers_xhtml + "\t\t<meta property=\"role\" refines=\"#transcriber-{}\" scheme=\"marc:relators\">trc</meta>\n".format(
                    i)

                i = i + 1

            metadata_xhtml = regex.sub(
                r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>",
                "\t\t" + producers_xhtml.strip(),
                metadata_xhtml,
                flags=regex.DOTALL)

        if author_wiki_url:
            metadata_xhtml = metadata_xhtml.replace(
                ">AUTHOR_WIKI_URL<", ">{}<".format(author_wiki_url))

        if author_nacoaf_url:
            metadata_xhtml = metadata_xhtml.replace(
                ">AUTHOR_NACOAF_URL<", ">{}<".format(author_nacoaf_url))

        if ebook_wiki_url:
            metadata_xhtml = metadata_xhtml.replace(
                ">EBOOK_WIKI_URL<", ">{}<".format(ebook_wiki_url))

        if args.translator:
            metadata_xhtml = metadata_xhtml.replace(
                ">TRANSLATOR<", ">{}<".format(args.translator))

            if translator_wiki_url:
                metadata_xhtml = metadata_xhtml.replace(
                    ">TRANSLATOR_WIKI_URL<",
                    ">{}<".format(translator_wiki_url))

            if translator_nacoaf_url:
                metadata_xhtml = metadata_xhtml.replace(
                    ">TRANSLATOR_NACOAF_URL<",
                    ">{}<".format(translator_nacoaf_url))
        else:
            # No translator: drop the translator contributor block entirely
            metadata_xhtml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?<dc:contributor id=\"artist\">",
                "<dc:contributor id=\"artist\">",
                metadata_xhtml,
                flags=regex.DOTALL)

        if args.pg_url:
            if pg_subjects:
                subject_xhtml = ""

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + "\t\t<dc:subject id=\"subject-{}\">{}</dc:subject>\n".format(
                        i, subject)
                    i = i + 1

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + "\t\t<meta property=\"authority\" refines=\"#subject-{}\">LCSH</meta>\n".format(
                        i)

                    # Now, get the LCSH ID by querying LCSH directly.
                    try:
                        response = requests.get(
                            "http://id.loc.gov/search/?q=%22{}%22".format(
                                urllib.parse.quote(subject)))
                        result = regex.search(
                            r"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{}</a>"
                            .format(regex.escape(subject.replace(" -- ",
                                                                 "--"))),
                            response.text)

                        loc_id = "Unknown"
                        try:
                            loc_id = result.group(1)
                        except AttributeError:
                            # result is None when the search page had no match;
                            # keep the "Unknown" placeholder
                            pass

                        subject_xhtml = subject_xhtml + "\t\t<meta property=\"term\" refines=\"#subject-{}\">{}</meta>\n".format(
                            i, loc_id)

                    except Exception as ex:
                        raise se.RemoteCommandErrorException(
                            "Couldn’t connect to id.loc.gov. Error: {}".format(
                                ex))

                    i = i + 1

                metadata_xhtml = regex.sub(
                    r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>",
                    "\t\t" + subject_xhtml.strip(), metadata_xhtml)

            metadata_xhtml = metadata_xhtml.replace(
                "<dc:language>LANG</dc:language>",
                "<dc:language>{}</dc:language>".format(pg_language))
            metadata_xhtml = metadata_xhtml.replace(
                "<dc:source>PG_URL</dc:source>",
                "<dc:source>{}</dc:source>".format(args.pg_url))

        file.seek(0)
        file.write(metadata_xhtml)
        file.truncate()

    # Set up local git repo
    repo = git.Repo.init(repo_name)

    if args.email:
        with repo.config_writer() as config:
            config.set_value("user", "email", args.email)

    # Set up remote git repos
    if args.create_se_repo:
        git_command = git.cmd.Git(repo_name)
        git_command.remote(
            "add", "origin",
            "standardebooks.org:/standardebooks.org/ebooks/{}.git".format(
                repo_name))

        # Set git to automatically push to SE
        git_command.config("branch.master.remote", "origin")
        git_command.config("branch.master.merge", "refs/heads/master")

        github_option = ""
        if args.create_github_repo:
            github_option = "--github"

        return_code = call([
            "ssh", "standardebooks.org",
            "/standardebooks.org/scripts/init-se-repo --repo-name={} --title-string=\"{}\" {}"
            .format(repo_name, title_string, github_option)
        ])
        if return_code != 0:
            raise se.RemoteCommandErrorException(
                "Failed to create repository on Standard Ebooks server: ssh returned code {}."
                .format(return_code))
示例#18
0
def process_a_heading(node: EasyXmlElement, textf: str, is_toplevel: bool,
                      single_file: bool) -> TocItem:
    """
	Generate and return a single TocItem from this heading.

	INPUTS:
	node: an EasyXml node representing a heading
	textf: the path to the file
	is_toplevel: is this heading at the top-most level in the file?
	single_file: is there only one content file in the production (like some Poetry volumes)?

	OUTPUTS:
	a qualified TocItem object
	"""

    toc_item = TocItem()

    # Nesting level is the count of enclosing <section>/<article> elements; at least 1.
    enclosing = node.xpath(
        "./ancestor::*[name() = 'section' or name() = 'article']")
    toc_item.level = len(enclosing) if enclosing else 1

    toc_item.division = get_book_division(node)

    # The first heading in a file normally links to the file itself rather than to an
    # anchor id; the exception is a single-file production (eg some poetry volumes).
    toc_item.id = get_parent_id(node)  # pylint: disable=invalid-name
    if toc_item.id == "":
        toc_item.file_link = textf
    elif is_toplevel and not single_file:
        toc_item.file_link = textf
    else:
        toc_item.file_link = f"{textf}#{toc_item.id}"

    toc_item.lang = node.get_attr("xml:lang")
    epub_type = node.get_attr("epub:type")

    # An empty heading tag (eg a bare <h3>) carries no epub:type; examine the
    # parent's descendants instead to qualify the item.
    if not epub_type and node.tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
        parent = node.parent
        if not parent:  # shouldn't ever happen, but... just in case, raise an error
            raise se.InvalidInputException(
                f"Header without parent in file: [path][link=file://{textf}]{textf}[/][/]."
            )
        evaluate_descendants(parent, toc_item)
        return toc_item

    if epub_type:
        # A heading may include z3998:roman directly,
        # eg <h5 epub:type="title z3998:roman">II</h5>.
        if "z3998:roman" in epub_type:
            toc_item.roman = extract_strings(node)
            toc_item.title = f"<span epub:type=\"z3998:roman\">{toc_item.roman}</span>"
            return toc_item

        # An ordinal that is not a roman numeral (eg in Nietzche's Beyond Good and Evil).
        if "ordinal" in epub_type:
            toc_item.title = extract_strings(node)
            toc_item.title_is_ordinal = True
            return toc_item

        # A halftitle page with a subtitle: burrow down into the hgroup.
        if "fulltitle" in epub_type and node.tag == "hgroup":
            evaluate_descendants(node, toc_item)
            return toc_item

        # A straightforward one-level title, eg <h2 epub:type="title">Imprint</h2>.
        if "title" in epub_type:
            toc_item.title = extract_strings(node)
            return toc_item

    # Otherwise, burrow down into the structure to get the info.
    evaluate_descendants(node, toc_item)
    return toc_item
def process_headings(dom: EasyXmlTree, textf: str, toc_list: list,
                     nest_under_halftitle: bool, single_file: bool) -> None:
    """
	Find headings in current file and extract title data
	into items added to toc_list.

	INPUTS:
	dom: an EasyXmlTree representation of the current file
	textf: the path to the file
	toc_list: the list of ToC items we are building
	nest_under_halftitle: does this item need to be nested?
	single_file: is there only a single content item in the production?

	OUTPUTS:
	None
	"""

    body = dom.xpath("//body")
    if not body:
        raise se.InvalidInputException("Couldn’t locate [xhtml]<body>[/].")
    place = get_place(body[0])

    # Collect every hgroup and h1–h6 heading in document order.
    heads = dom.xpath("//hgroup | //h1 | //h2 | //h3 | //h4 | //h5 | //h6")

    if not heads:
        # No heading tag at all — may be a dedication or an epigraph.
        if single_file and nest_under_halftitle:
            # There's a halftitle, but only this one content file with no subsections,
            # so leave out of ToC because the Toc will link to the halftitle.
            return

        special_item = TocItem()
        # No heading to measure depth with, so use the first content item instead.
        content_item = dom.xpath("//p | //header | //img")
        if content_item is not None:  # xpath() returns a list, so this is always true; kept for parity
            special_item.level = get_level(content_item[0], toc_list)

        # Fall back to the page title for the ToC entry title.
        special_item.title = dom.xpath("//head/title/text()", True)
        if special_item.title is None:
            special_item.title = "NO TITLE"

        special_item.file_link = textf
        special_item.place = place
        toc_list.append(special_item)
        return

    is_toplevel = True
    for heading in heads:
        # A heading inside an hgroup is handled with its group, not separately.
        if heading.parent.tag == "hgroup":
            continue

        # single_file only matters for bodymatter items.
        effective_single_file = single_file if place == Position.BODY else False
        toc_item = process_a_heading(heading, textf, is_toplevel,
                                     effective_single_file)

        toc_item.level = get_level(heading, toc_list)
        toc_item.place = place

        # Exception: the titlepage is always titled 'Titlepage' in the ToC.
        if dom.xpath("//section[re:test(@epub:type, '\\btitlepage\\b')]"):
            toc_item.title = "Titlepage"

        is_toplevel = False
        toc_list.append(toc_item)
示例#20
0
    def generate_endnotes(self) -> Tuple[int, int]:
        """
		Read the epub spine to regenerate all endnotes in order of appearance, starting from 1.
		Changes are written to disk.

		Returns a tuple of (found_endnote_count, changed_endnote_count)
		"""

        processed = 0
        current_note_number = 1
        notes_changed = 0
        change_list = []

        for file_name in self.get_content_files():
            if file_name in [
                    "titlepage.xhtml", "colophon.xhtml", "uncopyright.xhtml",
                    "imprint.xhtml", "halftitlepage.xhtml", "endnotes.xhtml"
            ]:
                continue

            processed += 1

            file_path = self.path / "src/epub/text" / file_name
            try:
                dom = self.get_dom(file_path)
            except Exception as ex:
                raise se.InvalidFileException(
                    f"Couldn’t open file: [path][link=file://{file_path}]{file_path}[/][/]."
                ) from ex

            needs_rewrite = False
            for link in dom.xpath(
                    "/html/body//a[contains(@epub:type, 'noteref')]"):
                old_anchor = ""
                href = link.get_attr("href") or ""
                if href:
                    # Extract just the anchor from a URL (ie, what follows a hash symbol)
                    hash_position = href.find(
                        "#") + 1  # we want the characters AFTER the hash
                    if hash_position > 0:
                        old_anchor = href[hash_position:]

                new_anchor = f"note-{current_note_number:d}"
                if new_anchor != old_anchor:
                    change_list.append(
                        f"Changed {old_anchor} to {new_anchor} in {file_name}")
                    notes_changed += 1
                    # Update the link in the dom
                    link.set_attr("href", f"endnotes.xhtml#{new_anchor}")
                    link.set_attr("id", f"noteref-{current_note_number:d}")
                    link.lxml_element.text = str(current_note_number)
                    needs_rewrite = True

                # Now try to find this in endnotes
                match_old = lambda x, old=old_anchor: x.anchor == old
                matches = list(filter(match_old, self.endnotes))
                if not matches:
                    raise se.InvalidInputException(
                        f"Couldn’t find endnote with anchor [attr]{old_anchor}[/]."
                    )
                if len(matches) > 1:
                    raise se.InvalidInputException(
                        f"Duplicate anchors in endnotes file for anchor [attr]{old_anchor}[/]."
                    )
                # Found a single match, which is what we want
                endnote = matches[0]
                endnote.number = current_note_number
                endnote.matched = True
                # We don't change the anchor or the back ref just yet
                endnote.source_file = file_name
                current_note_number += 1

            # If we need to write back the body text file
            if needs_rewrite:
                with open(file_path, "w") as file:
                    file.write(se.formatting.format_xhtml(dom.to_string()))

        if processed == 0:
            raise se.InvalidInputException(
                "No files processed. Did you update the manifest and order the spine?"
            )

        if notes_changed > 0:
            # Now we need to recreate the endnotes file
            endnotes_dom = self.get_dom(self.path / "src" / "epub" / "text" /
                                        "endnotes.xhtml")
            for ol_node in endnotes_dom.xpath(
                    "/html/body/section[contains(@epub:type, 'endnotes')]/ol[1]"
            ):
                for node in ol_node.xpath(
                        "./li[contains(@epub:type, 'endnote')]"):
                    node.remove()

                self.endnotes.sort(key=lambda endnote: endnote.number)

                for endnote in self.endnotes:
                    if endnote.matched:
                        endnote.node.set_attr("id", f"note-{endnote.number}")

                        for node in endnote.node.xpath(
                                ".//a[contains(@epub:type, 'backlink')]"):
                            node.set_attr(
                                "href",
                                f"{endnote.source_file}#noteref-{endnote.number}"
                            )

                        ol_node.append(endnote.node)

            with open(self.path / "src" / "epub" / "text" / "endnotes.xhtml",
                      "w") as file:
                file.write(se.formatting.format_xhtml(
                    endnotes_dom.to_string()))

        return (current_note_number - 1, notes_changed)
def evaluate_descendants(node: EasyXmlElement, toc_item: TocItem,
                         textf: str) -> TocItem:
    """
	Burrow down into a hgroup structure to qualify the ToC item

	INPUTS:
	node: EasyXmlElement object representing a hgroup
	toc_item: the ToC item being qualified
	textf: the path to the file, used in error messages

	OUTPUTS:
	toc_item: qualified ToC item
	"""
    # We expect the hgroup's children to be h1, h2, h3, h4 etc.
    for heading in node.xpath("./h1 | ./h2 | ./h3 | ./h4 | ./h5 | ./h6"):
        if not toc_item.lang:
            toc_item.lang = heading.get_attr("xml:lang")

        if heading.get_attr("hidden"):
            toc_item.hidden = True

        etype = heading.get_attr("epub:type")

        if not etype:
            # No epub:type, so this should be a label/ordinal grouping
            strings = get_child_strings(heading)
            if "label" in strings and "ordinal" in strings:  # quick test
                toc_item.title_is_ordinal = True
                # strip label
                strings = regex.sub(
                    r"<span epub:type=\"label\">(.*?)</span>", " \\1 ",
                    strings)
                # remove ordinal if it's by itself in a span
                strings = regex.sub(
                    r"<span epub:type=\"ordinal\">(.*?)</span>", " \\1 ",
                    strings)
                # remove ordinal if it's joined with a roman (which we want to keep)
                strings = regex.sub(r"\bordinal\b", "", strings)
                # collapse runs of spaces
                strings = regex.sub(r"[ ]{2,}", " ", strings)
                # remove any carriage returns
                strings = regex.sub(r"\n", "", strings)
                # get rid of any endnotes
                strings = strip_notes(strings)
                toc_item.title = strings.strip()
            continue  # nothing more to do for this child

        if "z3998:roman" in etype:
            toc_item.roman = extract_strings(heading)
            try:
                roman.fromRoman(toc_item.roman)
            except roman.InvalidRomanNumeralError as err:
                raise se.InvalidInputException(
                    f"Heading tagged as roman numeral is invalid: [path][link=file://{textf}]{textf}[/][/]."
                ) from err
            if not toc_item.title:
                toc_item.title = f"<span epub:type=\"z3998:roman\">{toc_item.roman}</span>"
        elif "ordinal" in etype:
            # An ordinal that is neither a roman numeral nor a labelled item
            # (those cases were caught above)
            if not toc_item.title:
                toc_item.title = extract_strings(heading)
                toc_item.title_is_ordinal = True

        # Test "subtitle" before "title" because the former contains the latter
        if "subtitle" in etype:
            toc_item.subtitle = extract_strings(heading)
        elif "title" in etype:  # this allows for `fulltitle` to work here, too
            if toc_item.title or toc_item.roman or toc_item.title_is_ordinal:
                # A title is already filled in, so this must be a subtitle
                toc_item.subtitle = extract_strings(heading)
            else:
                toc_item.title = extract_strings(heading)

        if toc_item.title and toc_item.subtitle:
            # Both parts found, so we're done
            return toc_item
    return toc_item
示例#22
0
def _create_draft(args: Namespace):
    """
	Implementation for `se create-draft`

	INPUTS:
	args: an argparse Namespace; this function reads args.title, args.author,
	args.translator, args.illustrator, args.pg_url, args.offline, and args.email

	OUTPUTS:
	None; creates and populates a new ebook draft directory on disk, and
	initializes a git repository in it
	"""

    # Put together some variables for later use.
    # Apostrophes are converted to typographic right single quotes throughout.
    authors = []
    translators = []
    illustrators = []
    pg_producers = []
    title = args.title.replace("'", "’")

    for author in args.author:
        authors.append({
            "name": author.replace("'", "’"),
            "wiki_url": None,
            "nacoaf_url": None
        })

    if args.translator:
        for translator in args.translator:
            translators.append({
                "name": translator.replace("'", "’"),
                "wiki_url": None,
                "nacoaf_url": None
            })

    if args.illustrator:
        for illustrator in args.illustrator:
            illustrators.append({
                "name": illustrator.replace("'", "’"),
                "wiki_url": None,
                "nacoaf_url": None
            })

    title_string = title
    if authors and authors[0]["name"].lower() != "anonymous":
        title_string += ", by " + _generate_contributor_string(authors, False)

    # Build the SE identifier: url-safe author names joined by "_",
    # then "/" and the url-safe title (translators/illustrators appended below)
    identifier = ""
    for author in authors:
        identifier += se.formatting.make_url_safe(author["name"]) + "_"

    identifier = identifier.rstrip("_") + "/" + se.formatting.make_url_safe(
        title)

    # Move a leading article to the end for sorting, e.g. "The X" -> "X, The"
    sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", title)

    if translators:
        title_string = title_string + ". Translated by " + _generate_contributor_string(
            translators, False)

        identifier = identifier + "/"

        for translator in translators:
            identifier += se.formatting.make_url_safe(translator["name"]) + "_"

        identifier = identifier.rstrip("_")

    if illustrators:
        title_string = title_string + ". Illustrated by " + _generate_contributor_string(
            illustrators, False)

        identifier = identifier + "/"

        for illustrator in illustrators:
            identifier += se.formatting.make_url_safe(
                illustrator["name"]) + "_"

        identifier = identifier.rstrip("_")

    # The repository name is the identifier with "/" flattened to "_"
    repo_name = identifier.replace("/", "_")

    repo_path = Path(repo_name).resolve()

    if repo_path.is_dir():
        raise se.InvalidInputException(
            f"Directory already exists: [path][link=file://{repo_path}]{repo_path}[/][/]."
        )

    # Get data on authors
    for i, author in enumerate(authors):
        if not args.offline and author["name"].lower() != "anonymous":
            author["wiki_url"], author["nacoaf_url"] = _get_wikipedia_url(
                author["name"], True)

    # Get data on translators
    for i, translator in enumerate(translators):
        if not args.offline and translator["name"].lower() != "anonymous":
            translator["wiki_url"], translator[
                "nacoaf_url"] = _get_wikipedia_url(translator["name"], True)

    # Get data on illustrators
    for i, illustrator in enumerate(illustrators):
        if not args.offline and illustrator["name"].lower() != "anonymous":
            illustrator["wiki_url"], illustrator[
                "nacoaf_url"] = _get_wikipedia_url(illustrator["name"], True)

    # Download PG HTML and do some fixups
    if args.pg_url:
        if args.offline:
            raise se.RemoteCommandErrorException(
                "Cannot download Project Gutenberg ebook when offline option is enabled."
            )

        args.pg_url = args.pg_url.replace("http://", "https://")

        # Get the ebook metadata
        try:
            response = requests.get(args.pg_url)
            pg_metadata_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook metadata page. Exception: {ex}"
            )

        parser = etree.HTMLParser()
        dom = etree.parse(StringIO(pg_metadata_html), parser)

        # Get the ebook HTML URL from the metadata; scheme-relative and
        # root-relative hrefs are normalized to absolute https URLs
        pg_ebook_url = None
        for node in dom.xpath("/html/body//a[contains(@type, 'text/html')]"):
            pg_ebook_url = regex.sub(r"^//", "https://", node.get("href"))
            pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/",
                                     pg_ebook_url)

        if not pg_ebook_url:
            raise se.RemoteCommandErrorException(
                "Could download ebook metadata, but couldn’t find URL for the ebook HTML."
            )

        # Get the ebook LCSH categories
        pg_subjects = []
        for node in dom.xpath(
                "/html/body//td[contains(@property, 'dcterms:subject')]"):
            if node.get("datatype") == "dcterms:LCSH":
                for subject_link in node.xpath("./a"):
                    pg_subjects.append(subject_link.text.strip())

        # Get the PG publication date (just the 4-digit year)
        pg_publication_year = None
        for node in dom.xpath("//td[@itemprop='datePublished']"):
            pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", node.text)

        # Get the actual ebook URL
        try:
            response = requests.get(pg_ebook_url)
            pg_ebook_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook HTML. Exception: {ex}"
            )

        # Normalize mojibake and strip any BOM
        try:
            fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
            pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
        except Exception as ex:
            raise se.InvalidEncodingException(
                f"Couldn’t determine text encoding of Project Gutenberg HTML file. Exception: {ex}"
            )

        # Try to guess the ebook language from British spellings
        pg_language = "en-US"
        if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
            pg_language = "en-GB"

    # Create necessary directories
    (repo_path / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "css").mkdir(parents=True)
    (repo_path / "src" / "epub" / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "text").mkdir(parents=True)
    (repo_path / "src" / "META-INF").mkdir(parents=True)

    is_pg_html_parsed = True

    # Write PG data if we have it
    if args.pg_url and pg_ebook_html:
        try:
            dom = etree.parse(
                StringIO(regex.sub(r"encoding=\".+?\"", "", pg_ebook_html)),
                parser)
            namespaces = {"re": "http://exslt.org/regular-expressions"}

            # Extract the names of the PG producers from the header boilerplate
            for node in dom.xpath(
                    "//*[re:test(text(), '\\*\\*\\*\\s*Produced by.+')]",
                    namespaces=namespaces):
                producers_text = regex.sub(
                    r"^<[^>]+?>", "",
                    etree.tostring(node, encoding=str, with_tail=False))
                producers_text = regex.sub(r"<[^>]+?>$", "", producers_text)

                producers_text = regex.sub(r".+?Produced by (.+?)\s*$",
                                           "\\1",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r"\(.+?\)",
                                           "",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net",
                                           "",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r"[\r\n]+",
                                           " ",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r",? and ", ", and ",
                                           producers_text)
                producers_text = producers_text.replace(
                    " and the Online", " and The Online")
                producers_text = producers_text.replace(", and ", ", ").strip()

                pg_producers = [
                    producer.strip()
                    for producer in regex.split(',|;', producers_text)
                ]

            # Try to strip out the PG header
            for node in dom.xpath(
                    "//*[re:test(text(), '\\*\\*\\*\\s*START OF THIS')]",
                    namespaces=namespaces):
                for sibling_node in node.xpath("./preceding-sibling::*"):
                    easy_node = se.easy_xml.EasyXmlElement(sibling_node)
                    easy_node.remove()

                easy_node = se.easy_xml.EasyXmlElement(node)
                easy_node.remove()

            # Try to strip out the PG license footer
            for node in dom.xpath(
                    "//*[re:test(text(), 'End of (the )?Project Gutenberg')]",
                    namespaces=namespaces):
                for sibling_node in node.xpath("./following-sibling::*"):
                    easy_node = se.easy_xml.EasyXmlElement(sibling_node)
                    easy_node.remove()

                easy_node = se.easy_xml.EasyXmlElement(node)
                easy_node.remove()

            # lxml will put the xml declaration in a weird place, remove it first
            output = regex.sub(r"<\?xml.+?\?>", "",
                               etree.tostring(dom, encoding="unicode"))

            # Now re-add it
            output = """<?xml version="1.0" encoding="utf-8"?>\n""" + output

            # lxml can also output duplicate default namespace declarations so remove the first one only
            output = regex.sub(r"(xmlns=\".+?\")(\sxmlns=\".+?\")+", r"\1",
                               output)

            with open(repo_path / "src" / "epub" / "text" / "body.xhtml",
                      "w",
                      encoding="utf-8") as file:
                file.write(output)

        except OSError as ex:
            raise se.InvalidFileException(
                f"Couldn’t write to ebook directory. Exception: {ex}")
        except Exception as ex:
            # Save this error for later, because it's still useful to complete the create-draft process
            # even if we've failed to parse PG's HTML source.
            is_pg_html_parsed = False
            se.quiet_remove(repo_path / "src" / "epub" / "text" / "body.xhtml")

    # Copy over templates
    _copy_template_file("gitignore", repo_path / ".gitignore")
    _copy_template_file("LICENSE.md", repo_path)
    _copy_template_file("container.xml", repo_path / "src" / "META-INF")
    _copy_template_file("mimetype", repo_path / "src")
    _copy_template_file("content.opf", repo_path / "src" / "epub")
    _copy_template_file("onix.xml", repo_path / "src" / "epub")
    _copy_template_file("toc.xhtml", repo_path / "src" / "epub")
    _copy_template_file("core.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("local.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("se.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("logo.svg", repo_path / "src" / "epub" / "images")
    _copy_template_file("colophon.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("imprint.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("uncopyright.xhtml",
                        repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.svg", repo_path / "images")
    _copy_template_file("cover.jpg", repo_path / "images" / "cover.jpg")
    _copy_template_file("cover.svg", repo_path / "images" / "cover.svg")

    # Try to find Wikipedia links if possible
    ebook_wiki_url = None

    if not args.offline and title != "Short Fiction":
        # There's a "Short Fiction" Wikipedia article, so make an exception for that case
        ebook_wiki_url, _ = _get_wikipedia_url(title, False)

    # Pre-fill a few templates
    _replace_in_file(repo_path / "src" / "epub" / "text" / "titlepage.xhtml",
                     "TITLE_STRING", title_string)
    _replace_in_file(repo_path / "images" / "titlepage.svg", "TITLE_STRING",
                     title_string)
    _replace_in_file(repo_path / "images" / "cover.svg", "TITLE_STRING",
                     title_string)

    # Create the titlepage SVG
    contributors = {}
    if args.translator:
        contributors["translated by"] = _generate_contributor_string(
            translators, False)

    if args.illustrator:
        contributors["illustrated by"] = _generate_contributor_string(
            illustrators, False)

    with open(repo_path / "images" / "titlepage.svg", "w",
              encoding="utf-8") as file:
        file.write(
            _generate_titlepage_svg(title,
                                    [author["name"] for author in authors],
                                    contributors, title_string))

    # Create the cover SVG
    with open(repo_path / "images" / "cover.svg", "w",
              encoding="utf-8") as file:
        file.write(
            _generate_cover_svg(title, [author["name"] for author in authors],
                                title_string))

    # Build the cover/titlepage for distribution
    epub = SeEpub(repo_path)
    epub.generate_cover_svg()
    epub.generate_titlepage_svg()

    if args.pg_url:
        _replace_in_file(repo_path / "src" / "epub" / "text" / "imprint.xhtml",
                         "PG_URL", args.pg_url)

    # Fill out the colophon by replacing the placeholders in the template
    with open(repo_path / "src" / "epub" / "text" / "colophon.xhtml",
              "r+",
              encoding="utf-8") as file:
        colophon_xhtml = file.read()

        colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
        colophon_xhtml = colophon_xhtml.replace("TITLE", title)

        contributor_string = _generate_contributor_string(authors, True)

        # An empty contributor string means an anonymous author, so remove
        # the whole "by AUTHOR" phrase from the template
        if contributor_string == "":
            colophon_xhtml = colophon_xhtml.replace(
                " by<br/>\n\t\t\t<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>",
                contributor_string)
        else:
            colophon_xhtml = colophon_xhtml.replace(
                "<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>", contributor_string)

        if translators:
            translator_block = f"It was translated from ORIGINAL_LANGUAGE in TRANSLATION_YEAR by<br/>\n\t\t\t{_generate_contributor_string(translators, True)}.</p>"
            colophon_xhtml = colophon_xhtml.replace(
                "</p>\n\t\t\t<p>This ebook was produced for the<br/>",
                f"<br/>\n\t\t\t{translator_block}\n\t\t\t<p>This ebook was produced for the<br/>"
            )

        if args.pg_url:
            colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

            if pg_publication_year:
                colophon_xhtml = colophon_xhtml.replace(
                    "PG_YEAR", pg_publication_year)

            if pg_producers:
                # Build an Oxford-comma-separated list of producer credits
                producers_xhtml = ""
                for i, producer in enumerate(pg_producers):
                    if "Distributed Proofread" in producer:
                        producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
                    elif "anonymous" in producer.lower():
                        producers_xhtml = producers_xhtml + "<b class=\"name\">An Anonymous Volunteer</b>"
                    else:
                        producers_xhtml = producers_xhtml + f"<b class=\"name\">{_add_name_abbr(producer).strip('.')}</b>"

                    if i < len(pg_producers) - 1:
                        producers_xhtml = producers_xhtml + ", "

                    if i == len(pg_producers) - 2:
                        producers_xhtml = producers_xhtml + "and "

                producers_xhtml = producers_xhtml + "<br/>"

                colophon_xhtml = colophon_xhtml.replace(
                    "<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>",
                    producers_xhtml)

        # Overwrite the template file in place
        file.seek(0)
        file.write(colophon_xhtml)
        file.truncate()

    # Fill out the metadata file
    with open(repo_path / "src" / "epub" / "content.opf",
              "r+",
              encoding="utf-8") as file:
        metadata_xml = file.read()

        metadata_xml = metadata_xml.replace("SE_IDENTIFIER", identifier)
        metadata_xml = metadata_xml.replace(">TITLE_SORT<",
                                            f">{sorted_title}<")
        metadata_xml = metadata_xml.replace(">TITLE<", f">{title}<")
        metadata_xml = metadata_xml.replace("VCS_IDENTIFIER", str(repo_name))

        if pg_producers:
            # Emit one dc:contributor + refinement metas per producer
            producers_xhtml = ""
            i = 1
            for producer in pg_producers:
                if "Distributed Proofread" in producer:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">The Online Distributed Proofreading Team</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{i}\">https://pgdp.net</meta>\n"
                elif "anonymous" in producer.lower():
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">An Anonymous Volunteer</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Anonymous Volunteer, An</meta>\n"
                else:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">{producer.strip('.')}</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">TRANSCRIBER_SORT</meta>\n"

                producers_xhtml = producers_xhtml + f"\t\t<meta property=\"role\" refines=\"#transcriber-{i}\" scheme=\"marc:relators\">trc</meta>\n"

                i = i + 1

            metadata_xml = regex.sub(
                r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>",
                "\t\t" + producers_xhtml.strip(),
                metadata_xml,
                flags=regex.DOTALL)

        if ebook_wiki_url:
            metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<",
                                                f">{ebook_wiki_url}<")

        authors_xml = _generate_metadata_contributor_xml(authors, "author")
        authors_xml = authors_xml.replace("dc:contributor", "dc:creator")
        metadata_xml = regex.sub(
            r"<dc:creator id=\"author\">AUTHOR</dc:creator>.+?scheme=\"marc:relators\">aut</meta>",
            authors_xml,
            metadata_xml,
            flags=regex.DOTALL)

        # Replace the translator placeholder block, or remove it entirely
        # if there are no translators
        if translators:
            translators_xml = _generate_metadata_contributor_xml(
                translators, "translator")
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>",
                translators_xml,
                metadata_xml,
                flags=regex.DOTALL)
        else:
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>\n\t\t",
                "",
                metadata_xml,
                flags=regex.DOTALL)

        # Same for the illustrator placeholder block
        if illustrators:
            illustrators_xml = _generate_metadata_contributor_xml(
                illustrators, "illustrator")
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>",
                illustrators_xml,
                metadata_xml,
                flags=regex.DOTALL)
        else:
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>\n\t\t",
                "",
                metadata_xml,
                flags=regex.DOTALL)

        if args.pg_url:
            if pg_subjects:
                subject_xhtml = ""

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<dc:subject id=\"subject-{i}\">{subject}</dc:subject>\n"
                    i = i + 1

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<meta property=\"authority\" refines=\"#subject-{i}\">LCSH</meta>\n"

                    # Now, get the LCSH ID by querying LCSH directly.
                    try:
                        response = requests.get(
                            f"https://id.loc.gov/search/?q=%22{urllib.parse.quote(subject)}%22"
                        )
                        result = regex.search(
                            fr"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{regex.escape(subject.replace(' -- ', '--'))}</a>",
                            response.text)

                        loc_id = "Unknown"
                        try:
                            loc_id = result.group(1)
                        except Exception as ex:
                            # Best effort: fall back to "Unknown" when the
                            # search page doesn't contain a matching record
                            pass

                        subject_xhtml = subject_xhtml + f"\t\t<meta property=\"term\" refines=\"#subject-{i}\">{loc_id}</meta>\n"

                    except Exception as ex:
                        raise se.RemoteCommandErrorException(
                            f"Couldn’t connect to [url][link=https://id.loc.gov]https://id.loc.gov[/][/]. Exception: {ex}"
                        )

                    i = i + 1

                metadata_xml = regex.sub(
                    r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>",
                    "\t\t" + subject_xhtml.strip(), metadata_xml)

            metadata_xml = metadata_xml.replace(
                "<dc:language>LANG</dc:language>",
                f"<dc:language>{pg_language}</dc:language>")
            metadata_xml = metadata_xml.replace(
                "<dc:source>PG_URL</dc:source>",
                f"<dc:source>{args.pg_url}</dc:source>")

        # Overwrite the template file in place
        file.seek(0)
        file.write(metadata_xml)
        file.truncate()

    # Set up local git repo
    repo = git.Repo.init(repo_path)

    if args.email:
        with repo.config_writer() as config:
            config.set_value("user", "email", args.email)

    # Raise the deferred HTML parsing error now that the draft is complete
    if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
        raise se.InvalidXhtmlException(
            "Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook."
        )
示例#23
0
def process_headings(dom: EasyXmlTree, textf: str, toc_list: list,
                     single_file: bool,
                     single_file_without_headers: bool) -> None:
    """
	Find headings in current file and extract title data
	into items added to toc_list.

	INPUTS:
	dom: an EasyXmlTree representation of the current file
	textf: the path to the file
	toc_list: the list of ToC items we are building
	single_file: is there only a single content item in the production?
	single_file_without_headers: is there only a single content item, with no headings?

	OUTPUTS:
	None
	"""

    body = dom.xpath("//body")
    if not body:
        raise se.InvalidInputException("Couldn’t locate [xhtml]<body>[/].")
    place = get_place(body[0])

    is_toplevel = True

    # Find all the hgroups and h1, h2 etc headings.
    heads = dom.xpath("//hgroup | //h1 | //h2 | //h3 | //h4 | //h5 | //h6")

    # special treatment where we can't find any header or hgroups
    if not heads:  # May be a dedication or an epigraph, with no heading tag.
        special_item = TocItem()
        # Need to determine level depth.
        # We don't have a heading, so get first content item
        content_item = dom.xpath("//p | //header | //img")
        # xpath returns a list, so test for emptiness rather than None —
        # an empty result previously slipped past an `is not None` check
        # and crashed on the [0] index below
        if content_item:
            special_item.level = get_level(content_item[0], toc_list)
        special_item.title = dom.xpath(
            "//head/title/text()",
            True)  # Use the page title as the ToC entry title.
        if special_item.title is None:
            special_item.title = "NO TITLE"
        special_item.file_link = textf
        special_item.place = place
        toc_list.append(special_item)
        return

    for heading in heads:
        # don't process a heading separately if it's within a hgroup
        if heading.parent.tag == "hgroup":
            continue  # skip it

        if place == Position.BODY:
            toc_item = process_a_heading(heading, textf, is_toplevel,
                                         single_file)
        else:
            # if it's not a bodymatter item we don't care about whether it's single_file
            toc_item = process_a_heading(heading, textf, is_toplevel, False)

        toc_item.level = get_level(heading, toc_list)
        toc_item.place = place

        # Exception: The titlepage is always titled 'Titlepage' in the ToC
        if dom.xpath("//section[re:test(@epub:type, '\\btitlepage\\b')]"):
            toc_item.title = "Titlepage"

        # Exception: If there is only a single body item WITHOUT HEADERS (like Father Goriot or The Path to Rome),
        # the half title page is listed as "Half-Titlepage" instead of the work title,
        # so that we don't duplicate the work title in the ToC. We always include a link to the work body
        # in the ToC because readers on the web version need to have access to the text starting point, since
        # there are no back/forward nav buttons in XHTML files served on the web.
        if single_file_without_headers and dom.xpath(
                "//section[re:test(@epub:type, '\\bhalftitlepage\\b')]"):
            toc_item.title = "Half-Titlepage"

        is_toplevel = False
        toc_list.append(toc_item)