Example No. 1
    def _endnotes_dom(self) -> se.easy_xml.EasyXhtmlTree:
        """
		Accessor

		Return an EasyXhtmlTree object representing the endnotes.xhtml file for this ebook.

		INPUTS
		None

		OUTPUTS
		An EasyXhtmlTree object representing the endnotes.xhtml file for this ebook.
		"""

        if not self.__endnotes_dom:
            try:
                with open(self.path / "src" / "epub" / "text" /
                          "endnotes.xhtml", encoding="utf-8") as file:
                    self.__endnotes_dom = se.easy_xml.EasyXhtmlTree(
                        file.read())
            except Exception as ex:
                raise se.InvalidFileException(
                    f"Could't open file: [path][link=file://{self.path / 'src' / 'epub' / 'text' / 'endnotes.xhtml'}]{self.path / 'src' / 'epub' / 'text' / 'endnotes.xhtml'}[/][/]."
                ) from ex

        return self.__endnotes_dom
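
The accessor above follows a lazy-load-and-cache pattern: parse the file on first access, then reuse the cached object. A minimal, self-contained sketch of the same pattern (the class and attribute names here are hypothetical, not part of the se tools):

from pathlib import Path


class EbookExample:
	def __init__(self, path: Path):
		self.path = path
		self.__endnotes_text = None

	@property
	def endnotes_text(self) -> str:
		# Read and cache the file on first access; later accesses reuse the cached value.
		if not self.__endnotes_text:
			with open(self.path / "src" / "epub" / "text" / "endnotes.xhtml", encoding="utf-8") as file:
				self.__endnotes_text = file.read()
		return self.__endnotes_text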
Example No. 2
    def _endnotes_soup(self) -> BeautifulSoup:
        """
		Accessor

		Return a BeautifulSoup object representing the endnotes.xhtml file for this ebook.

		INPUTS
		None

		OUTPUTS
		A BeautifulSoup object representing the endnotes.xhtml file for this ebook.
		"""

        if not self.__endnotes_soup:
            try:
                with open(self.path / "src" / "epub" / "text" /
                          "endnotes.xhtml", encoding="utf-8") as file:
                    self.__endnotes_soup = BeautifulSoup(
                        file.read(), "html.parser")
            except Exception as ex:
                raise se.InvalidFileException("Couldn’t open file: {}".format(
                    str(self.path / "src" / "epub" / "text" /
                        "endnotes.xhtml"))) from ex

        return self.__endnotes_soup
Example No. 3
    def _endnotes_soup(self) -> BeautifulSoup:
        """
		Accessor

		Return a BeautifulSoup object representing the endnotes.xhtml file for this ebook.

		INPUTS
		None

		OUTPUTS
		A BeautifulSoup object representing the endnotes.xhtml file for this ebook.
		"""

        if not self.__endnotes_soup:
            try:
                with open(self.path / "src" / "epub" / "text" /
                          "endnotes.xhtml", encoding="utf-8") as file:
                    self.__endnotes_soup = BeautifulSoup(
                        file.read(), "html.parser")
            except Exception as ex:
                raise se.InvalidFileException(
                    f"Couldn’t open file: [path][link=file://{self.path / 'src' / 'epub' / 'text' / 'endnotes.xhtml'}]{self.path / 'src' / 'epub' / 'text' / 'endnotes.xhtml'}[/][/]."
                ) from ex

        return self.__endnotes_soup
Example No. 4
def process_all_content(file_list: list, text_path: str) -> Tuple[list, list]:
    """
	Analyze the whole content of the project, build and return lists
	of toc_items and landmarks.

	INPUTS:
	file_list: a list of all content files
	text_path: the path to the contents folder (src/epub/text)

	OUTPUTS:
	a tuple containing the list of landmark items and the list of ToC items
	"""

    toc_list: List[TocItem] = []
    landmarks: List[TocItem] = []

    # We make two passes through the work, because we need to know
    # how many bodymatter items there are. So we do landmarks first.
    for textf in file_list:
        file_path = Path(text_path) / textf
        try:
            with open(file_path, encoding="utf8") as file:
                dom = se.easy_xml.EasyXhtmlTree(file.read())
        except Exception as ex:
            raise se.InvalidFileException(
                f"Couldn’t open file: [path][link=file://{file_path}]{file_path}[/][/]. Exception: {ex}"
            ) from ex

        add_landmark(dom, textf, landmarks)

    # Now we test to see if there is only one body item
    body_items = [item for item in landmarks if item.place == Position.BODY]
    single_file = (len(body_items) == 1)

    nest_under_halftitle = False
    place = Position.NONE
    for textf in file_list:
        with open(Path(text_path) / textf, "r", encoding="utf-8") as file:
            dom = se.easy_xml.EasyXhtmlTree(file.read())
        body = dom.xpath("//body")
        if body:
            place = get_place(body[0])
        else:
            raise se.InvalidInputException("Couldn't locate body node")
        if place == Position.BACK:
            nest_under_halftitle = False
        process_headings(dom, textf, toc_list, nest_under_halftitle,
                         single_file)
        if textf == "halftitle.xhtml":
            nest_under_halftitle = True

    # We add this dummy item because outputtoc always needs to look ahead to the next item.
    last_toc = TocItem()
    last_toc.level = 1
    last_toc.title = "dummy"
    toc_list.append(last_toc)

    return landmarks, toc_list
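
The trailing "dummy" TocItem makes look-ahead safe: whatever consumes toc_list can always inspect the next item without bounds checks. A hedged sketch of that consumption (render_levels is a hypothetical helper, not part of the se tools; it only assumes the .title and .level fields used above):

def render_levels(toc_list):
	lines = []
	# zip() pairs each real item with its successor; the dummy sentinel guarantees that
	# even the last real item still has a successor to compare against.
	for this_item, next_item in zip(toc_list, toc_list[1:]):
		if next_item.level > this_item.level:
			lines.append(f"{this_item.title} (has children)")
		else:
			lines.append(this_item.title)
	return lines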
Example No. 5
def has_transparency(filename: Path) -> bool:
    """
	Return True if the given image file has transparency
	"""

    try:
        image = Image.open(filename)
    except UnidentifiedImageError as ex:
        raise se.InvalidFileException(
            f"Couldn’t identify image type of [path][link=file://{filename.resolve()}]{filename}[/]."
        ) from ex

    if image.mode == "P":
        transparent = image.info.get("transparency", -1)
        for _, index in image.getcolors():
            if index == transparent:
                return True
    elif image.mode == "RGBA":
        extrema = image.getextrema()
        if extrema[3][0] < 255:
            return True

    return False
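
A usage sketch for has_transparency(), assuming the function above is in scope; the directory name and glob pattern are illustrative:

from pathlib import Path

for image_path in Path("images").glob("*.png"):
	if has_transparency(image_path):
		print(f"{image_path} has transparent pixels")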
Example No. 6
	def get_file(self, file_path: Path) -> str:
		"""
		Get raw file contents of a file in the epub.
		Contents are cached so that we don't hit the disk repeatedly

		INPUTS
		file_path: A Path pointing to the file

		OUTPUTS
		A string representing the file contents
		"""

		file_path_str = str(file_path)

		if file_path_str not in self._file_cache:
			try:
				with open(file_path, "r", encoding="utf-8") as file:
					file_contents = file.read()
			except Exception as ex:
				raise se.InvalidFileException(f"Couldn’t read file: [path]{file_path_str}[/]") from ex

			self._file_cache[file_path_str] = file_contents

		return self._file_cache[file_path_str]
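
A usage sketch, assuming get_file() belongs to the SeEpub class used elsewhere in these examples; repeated reads of the same path are served from the in-memory cache rather than from disk:

from pathlib import Path

epub = SeEpub(Path("my-ebook"))  # hypothetical ebook directory
opf_path = epub.path / "src" / "epub" / "content.opf"
first = epub.get_file(opf_path)
second = epub.get_file(opf_path)  # second call is answered from self._file_cache, not the disk
assert first == second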
Example No. 7
def _create_draft(args: Namespace):
    """
	Implementation for `se create-draft`
	"""

    # Put together some variables for later use
    authors = []
    translators = []
    illustrators = []
    pg_producers = []
    title = args.title.replace("'", "’")

    for author in args.author:
        authors.append({
            "name": author.replace("'", "’"),
            "wiki_url": None,
            "nacoaf_url": None
        })

    if args.translator:
        for translator in args.translator:
            translators.append({
                "name": translator.replace("'", "’"),
                "wiki_url": None,
                "nacoaf_url": None
            })

    if args.illustrator:
        for illustrator in args.illustrator:
            illustrators.append({
                "name": illustrator.replace("'", "’"),
                "wiki_url": None,
                "nacoaf_url": None
            })

    title_string = title
    if authors and authors[0]["name"].lower() != "anonymous":
        title_string += ", by " + _generate_contributor_string(authors, False)

    identifier = ""
    for author in authors:
        identifier += se.formatting.make_url_safe(author["name"]) + "_"

    identifier = identifier.rstrip("_") + "/" + se.formatting.make_url_safe(
        title)

    sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", title)

    if translators:
        title_string = title_string + ". Translated by " + _generate_contributor_string(
            translators, False)

        identifier = identifier + "/"

        for translator in translators:
            identifier += se.formatting.make_url_safe(translator["name"]) + "_"

        identifier = identifier.rstrip("_")

    if illustrators:
        title_string = title_string + ". Illustrated by " + _generate_contributor_string(
            illustrators, False)

        identifier = identifier + "/"

        for illustrator in illustrators:
            identifier += se.formatting.make_url_safe(
                illustrator["name"]) + "_"

        identifier = identifier.rstrip("_")

    repo_name = identifier.replace("/", "_")

    repo_path = Path(repo_name).resolve()

    if repo_path.is_dir():
        raise se.InvalidInputException(
            f"Directory already exists: [path][link=file://{repo_path}]{repo_path}[/][/]."
        )

    # Get data on authors
    for author in authors:
        if not args.offline and author["name"].lower() != "anonymous":
            author["wiki_url"], author["nacoaf_url"] = _get_wikipedia_url(
                author["name"], True)

    # Get data on translators
    for translator in translators:
        if not args.offline and translator["name"].lower() != "anonymous":
            translator["wiki_url"], translator[
                "nacoaf_url"] = _get_wikipedia_url(translator["name"], True)

    # Get data on illustrators
    for illustrator in illustrators:
        if not args.offline and illustrator["name"].lower() != "anonymous":
            illustrator["wiki_url"], illustrator[
                "nacoaf_url"] = _get_wikipedia_url(illustrator["name"], True)

    # Download PG HTML and do some fixups
    if args.pg_url:
        if args.offline:
            raise se.RemoteCommandErrorException(
                "Cannot download Project Gutenberg ebook when offline option is enabled."
            )

        args.pg_url = args.pg_url.replace("http://", "https://")

        # Get the ebook metadata
        try:
            response = requests.get(args.pg_url)
            pg_metadata_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook metadata page. Exception: {ex}"
            )

        parser = etree.HTMLParser()
        dom = etree.parse(StringIO(pg_metadata_html), parser)

        # Get the ebook HTML URL from the metadata
        pg_ebook_url = None
        for node in dom.xpath("/html/body//a[contains(@type, 'text/html')]"):
            pg_ebook_url = regex.sub(r"^//", "https://", node.get("href"))
            pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/",
                                     pg_ebook_url)

        if not pg_ebook_url:
            raise se.RemoteCommandErrorException(
                "Could download ebook metadata, but couldn’t find URL for the ebook HTML."
            )

        # Get the ebook LCSH categories
        pg_subjects = []
        for node in dom.xpath(
                "/html/body//td[contains(@property, 'dcterms:subject')]"):
            if node.get("datatype") == "dcterms:LCSH":
                for subject_link in node.xpath("./a"):
                    pg_subjects.append(subject_link.text.strip())

        # Get the PG publication date
        pg_publication_year = None
        for node in dom.xpath("//td[@itemprop='datePublished']"):
            pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", node.text)

        # Get the actual ebook URL
        try:
            response = requests.get(pg_ebook_url)
            pg_ebook_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook HTML. Exception: {ex}"
            )

        try:
            fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
            pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
        except Exception as ex:
            raise se.InvalidEncodingException(
                f"Couldn’t determine text encoding of Project Gutenberg HTML file. Exception: {ex}"
            )

        # Try to guess the ebook language
        pg_language = "en-US"
        if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
            pg_language = "en-GB"

    # Create necessary directories
    (repo_path / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "css").mkdir(parents=True)
    (repo_path / "src" / "epub" / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "text").mkdir(parents=True)
    (repo_path / "src" / "META-INF").mkdir(parents=True)

    is_pg_html_parsed = True

    # Write PG data if we have it
    if args.pg_url and pg_ebook_html:
        try:
            dom = etree.parse(
                StringIO(regex.sub(r"encoding=\".+?\"", "", pg_ebook_html)),
                parser)
            namespaces = {"re": "http://exslt.org/regular-expressions"}

            for node in dom.xpath(
                    "//*[re:test(text(), '\\*\\*\\*\\s*Produced by.+')]",
                    namespaces=namespaces):
                producers_text = regex.sub(
                    r"^<[^>]+?>", "",
                    etree.tostring(node, encoding=str, with_tail=False))
                producers_text = regex.sub(r"<[^>]+?>$", "", producers_text)

                producers_text = regex.sub(r".+?Produced by (.+?)\s*$",
                                           "\\1",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r"\(.+?\)",
                                           "",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net",
                                           "",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r"[\r\n]+",
                                           " ",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r",? and ", ", and ",
                                           producers_text)
                producers_text = producers_text.replace(
                    " and the Online", " and The Online")
                producers_text = producers_text.replace(", and ", ", ").strip()

                pg_producers = [
                    producer.strip()
                    for producer in regex.split(',|;', producers_text)
                ]

            # Try to strip out the PG header
            for node in dom.xpath(
                    "//*[re:test(text(), '\\*\\*\\*\\s*START OF THIS')]",
                    namespaces=namespaces):
                for sibling_node in node.xpath("./preceding-sibling::*"):
                    easy_node = se.easy_xml.EasyXmlElement(sibling_node)
                    easy_node.remove()

                easy_node = se.easy_xml.EasyXmlElement(node)
                easy_node.remove()

            # Try to strip out the PG license footer
            for node in dom.xpath(
                    "//*[re:test(text(), 'End of (the )?Project Gutenberg')]",
                    namespaces=namespaces):
                for sibling_node in node.xpath("./following-sibling::*"):
                    easy_node = se.easy_xml.EasyXmlElement(sibling_node)
                    easy_node.remove()

                easy_node = se.easy_xml.EasyXmlElement(node)
                easy_node.remove()

            # lxml will put the xml declaration in a weird place, so remove it first
            output = regex.sub(r"<\?xml.+?\?>", "",
                               etree.tostring(dom, encoding="unicode"))

            # Now re-add it
            output = """<?xml version="1.0" encoding="utf-8"?>\n""" + output

            # lxml can also output duplicate default namespace declarations, so keep only the first one
            output = regex.sub(r"(xmlns=\".+?\")(\sxmlns=\".+?\")+", r"\1",
                               output)

            with open(repo_path / "src" / "epub" / "text" / "body.xhtml",
                      "w",
                      encoding="utf-8") as file:
                file.write(output)

        except OSError as ex:
            raise se.InvalidFileException(
                f"Couldn’t write to ebook directory. Exception: {ex}")
        except Exception:
            # Save this error for later, because it's still useful to complete the create-draft process
            # even if we've failed to parse PG's HTML source.
            is_pg_html_parsed = False
            se.quiet_remove(repo_path / "src" / "epub" / "text" / "body.xhtml")

    # Copy over templates
    _copy_template_file("gitignore", repo_path / ".gitignore")
    _copy_template_file("LICENSE.md", repo_path)
    _copy_template_file("container.xml", repo_path / "src" / "META-INF")
    _copy_template_file("mimetype", repo_path / "src")
    _copy_template_file("content.opf", repo_path / "src" / "epub")
    _copy_template_file("onix.xml", repo_path / "src" / "epub")
    _copy_template_file("toc.xhtml", repo_path / "src" / "epub")
    _copy_template_file("core.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("local.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("se.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("logo.svg", repo_path / "src" / "epub" / "images")
    _copy_template_file("colophon.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("imprint.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("uncopyright.xhtml",
                        repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.svg", repo_path / "images")
    _copy_template_file("cover.jpg", repo_path / "images" / "cover.jpg")
    _copy_template_file("cover.svg", repo_path / "images" / "cover.svg")

    # Try to find Wikipedia links if possible
    ebook_wiki_url = None

    if not args.offline and title != "Short Fiction":
        # There's a "Short Fiction" Wikipedia article, so make an exception for that case
        ebook_wiki_url, _ = _get_wikipedia_url(title, False)

    # Pre-fill a few templates
    _replace_in_file(repo_path / "src" / "epub" / "text" / "titlepage.xhtml",
                     "TITLE_STRING", title_string)
    _replace_in_file(repo_path / "images" / "titlepage.svg", "TITLE_STRING",
                     title_string)
    _replace_in_file(repo_path / "images" / "cover.svg", "TITLE_STRING",
                     title_string)

    # Create the titlepage SVG
    contributors = {}
    if args.translator:
        contributors["translated by"] = _generate_contributor_string(
            translators, False)

    if args.illustrator:
        contributors["illustrated by"] = _generate_contributor_string(
            illustrators, False)

    with open(repo_path / "images" / "titlepage.svg", "w",
              encoding="utf-8") as file:
        file.write(
            _generate_titlepage_svg(title,
                                    [author["name"] for author in authors],
                                    contributors, title_string))

    # Create the cover SVG
    with open(repo_path / "images" / "cover.svg", "w",
              encoding="utf-8") as file:
        file.write(
            _generate_cover_svg(title, [author["name"] for author in authors],
                                title_string))

    # Build the cover/titlepage for distribution
    epub = SeEpub(repo_path)
    epub.generate_cover_svg()
    epub.generate_titlepage_svg()

    if args.pg_url:
        _replace_in_file(repo_path / "src" / "epub" / "text" / "imprint.xhtml",
                         "PG_URL", args.pg_url)

    # Fill out the colophon
    with open(repo_path / "src" / "epub" / "text" / "colophon.xhtml",
              "r+",
              encoding="utf-8") as file:
        colophon_xhtml = file.read()

        colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
        colophon_xhtml = colophon_xhtml.replace("TITLE", title)

        contributor_string = _generate_contributor_string(authors, True)

        if contributor_string == "":
            colophon_xhtml = colophon_xhtml.replace(
                " by<br/>\n\t\t\t<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>",
                contributor_string)
        else:
            colophon_xhtml = colophon_xhtml.replace(
                "<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>", contributor_string)

        if translators:
            translator_block = f"It was translated from ORIGINAL_LANGUAGE in TRANSLATION_YEAR by<br/>\n\t\t\t{_generate_contributor_string(translators, True)}.</p>"
            colophon_xhtml = colophon_xhtml.replace(
                "</p>\n\t\t\t<p>This ebook was produced for the<br/>",
                f"<br/>\n\t\t\t{translator_block}\n\t\t\t<p>This ebook was produced for the<br/>"
            )

        if args.pg_url:
            colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

            if pg_publication_year:
                colophon_xhtml = colophon_xhtml.replace(
                    "PG_YEAR", pg_publication_year)

            if pg_producers:
                producers_xhtml = ""
                for i, producer in enumerate(pg_producers):
                    if "Distributed Proofread" in producer:
                        producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
                    elif "anonymous" in producer.lower():
                        producers_xhtml = producers_xhtml + "<b class=\"name\">An Anonymous Volunteer</b>"
                    else:
                        producers_xhtml = producers_xhtml + f"<b class=\"name\">{_add_name_abbr(producer).strip('.')}</b>"

                    if i < len(pg_producers) - 1:
                        producers_xhtml = producers_xhtml + ", "

                    if i == len(pg_producers) - 2:
                        producers_xhtml = producers_xhtml + "and "

                producers_xhtml = producers_xhtml + "<br/>"

                colophon_xhtml = colophon_xhtml.replace(
                    "<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>",
                    producers_xhtml)

        file.seek(0)
        file.write(colophon_xhtml)
        file.truncate()

    # Fill out the metadata file
    with open(repo_path / "src" / "epub" / "content.opf",
              "r+",
              encoding="utf-8") as file:
        metadata_xml = file.read()

        metadata_xml = metadata_xml.replace("SE_IDENTIFIER", identifier)
        metadata_xml = metadata_xml.replace(">TITLE_SORT<",
                                            f">{sorted_title}<")
        metadata_xml = metadata_xml.replace(">TITLE<", f">{title}<")
        metadata_xml = metadata_xml.replace("VCS_IDENTIFIER", str(repo_name))

        if pg_producers:
            producers_xhtml = ""
            i = 1
            for producer in pg_producers:
                if "Distributed Proofread" in producer:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">The Online Distributed Proofreading Team</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{i}\">https://pgdp.net</meta>\n"
                elif "anonymous" in producer.lower():
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">An Anonymous Volunteer</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Anonymous Volunteer, An</meta>\n"
                else:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">{producer.strip('.')}</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">TRANSCRIBER_SORT</meta>\n"

                producers_xhtml = producers_xhtml + f"\t\t<meta property=\"role\" refines=\"#transcriber-{i}\" scheme=\"marc:relators\">trc</meta>\n"

                i = i + 1

            metadata_xml = regex.sub(
                r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>",
                "\t\t" + producers_xhtml.strip(),
                metadata_xml,
                flags=regex.DOTALL)

        if ebook_wiki_url:
            metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<",
                                                f">{ebook_wiki_url}<")

        authors_xml = _generate_metadata_contributor_xml(authors, "author")
        authors_xml = authors_xml.replace("dc:contributor", "dc:creator")
        metadata_xml = regex.sub(
            r"<dc:creator id=\"author\">AUTHOR</dc:creator>.+?scheme=\"marc:relators\">aut</meta>",
            authors_xml,
            metadata_xml,
            flags=regex.DOTALL)

        if translators:
            translators_xml = _generate_metadata_contributor_xml(
                translators, "translator")
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>",
                translators_xml,
                metadata_xml,
                flags=regex.DOTALL)
        else:
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>\n\t\t",
                "",
                metadata_xml,
                flags=regex.DOTALL)

        if illustrators:
            illustrators_xml = _generate_metadata_contributor_xml(
                illustrators, "illustrator")
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>",
                illustrators_xml,
                metadata_xml,
                flags=regex.DOTALL)
        else:
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>\n\t\t",
                "",
                metadata_xml,
                flags=regex.DOTALL)

        if args.pg_url:
            if pg_subjects:
                subject_xhtml = ""

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<dc:subject id=\"subject-{i}\">{subject}</dc:subject>\n"
                    i = i + 1

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<meta property=\"authority\" refines=\"#subject-{i}\">LCSH</meta>\n"

                    # Now, get the LCSH ID by querying LCSH directly.
                    try:
                        response = requests.get(
                            f"https://id.loc.gov/search/?q=%22{urllib.parse.quote(subject)}%22"
                        )
                        result = regex.search(
                            fr"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{regex.escape(subject.replace(' -- ', '--'))}</a>",
                            response.text)

                        loc_id = "Unknown"
                        try:
                            loc_id = result.group(1)
                        except Exception:
                            pass

                        subject_xhtml = subject_xhtml + f"\t\t<meta property=\"term\" refines=\"#subject-{i}\">{loc_id}</meta>\n"

                    except Exception as ex:
                        raise se.RemoteCommandErrorException(
                            f"Couldn’t connect to [url][link=https://id.loc.gov]https://id.loc.gov[/][/]. Exception: {ex}"
                        )

                    i = i + 1

                metadata_xml = regex.sub(
                    r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>",
                    "\t\t" + subject_xhtml.strip(), metadata_xml)

            metadata_xml = metadata_xml.replace(
                "<dc:language>LANG</dc:language>",
                f"<dc:language>{pg_language}</dc:language>")
            metadata_xml = metadata_xml.replace(
                "<dc:source>PG_URL</dc:source>",
                f"<dc:source>{args.pg_url}</dc:source>")

        file.seek(0)
        file.write(metadata_xml)
        file.truncate()

    # Set up local git repo
    repo = git.Repo.init(repo_path)

    if args.email:
        with repo.config_writer() as config:
            config.set_value("user", "email", args.email)

    if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
        raise se.InvalidXhtmlException(
            "Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook."
        )
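
The TITLE_SORT value above comes from a single substitution that moves a leading article to the end of the title. A small demonstration of that regex (the sample titles are illustrative):

import regex

for title in ["The Moonstone", "A Study in Scarlet", "Middlemarch"]:
	# Titles without a leading article pass through unchanged.
	print(regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", title))

# Moonstone, The
# Study in Scarlet, A
# Middlemarch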
Example No. 8
def svg_text_to_paths(in_svg: Path, out_svg: Path, remove_style=True) -> None:
	"""
	Convert SVG <text> elements into <path> elements, using the SVG
	document's <style> tag and external font files.
	(These SVG font files are built in to the SE tools.)
	The resulting SVG file will have no dependency on external fonts.

	INPUTS
	in_svg: Path of the SVG file whose <text> elements will be converted.
	out_svg: Path where the resulting SVG file, with <path> elements, will be written.

	OUTPUTS
	None.
	"""

	font_paths = []
	name_list = {"league_spartan": ["league-spartan-bold.svg"], "sorts_mill_goudy": ["sorts-mill-goudy-italic.svg", "sorts-mill-goudy.svg"]}
	for font_family, font_names in name_list.items():
		for font_name in font_names:
			with importlib_resources.path(f"se.data.fonts.{font_family}", font_name) as font_path:
				font_paths.append(font_path)
	fonts = []
	for font_path in font_paths:
		font = _parse_font(font_path)
		fonts.append(font)

	with open(in_svg, "rt") as svg_in_raw:
		try:
			xml = etree.fromstring(str.encode(svg_in_raw.read()))
		except Exception as ex:
			raise se.InvalidXmlException(f"Couldn’t parse SVG file: [path][link=file://{in_svg.resolve()}]{in_svg}[/][/].") from ex

	svg_ns = "{http://www.w3.org/2000/svg}"

	style = xml.find(svg_ns + "style")

	# Possibly remove style tag if caller wants that
	def filter_predicate(elem: etree.Element):
		if remove_style and elem.tag.endswith("style"):
			return None # Remove <style> tag
		return elem # Keep all other elements
	if remove_style:
		xml = _traverse_element(xml, filter_predicate)

	for elem in xml.iter():
		if elem.tag.endswith("text"):
			properties = _apply_css(elem, style.text)
			_get_properties_from_text_elem(properties, elem)
			_add_font_to_properties(properties, fonts)
			text = elem.text

			if not text:
				raise se.InvalidFileException(f"SVG [xml]<text>[/] element has no content. File: [path][link=file://{in_svg.resolve()}]{in_svg}[/].")

			elem.tag = "g"
			# Replace <text> tag with <g> tag
			for k in list(elem.attrib.keys()): # take a copy of the keys so attributes can be deleted while looping
				if k != "class":
					del elem.attrib[k]
				elif k == "class" and elem.attrib["class"] != "title-box": # Keep just class attribute if class="title-box"
					del elem.attrib[k]
			elem.attrib["aria-label"] = text
			elem.tail = "\n"
			elem.text = ""
			_add_svg_paths_to_group(elem, properties)

	xmlstr = etree.tostring(xml, pretty_print=True).decode("UTF-8")
	result_all_text = xmlstr.replace("ns0:", "").replace(":ns0", "")
	result_all_text = se.formatting.format_xml(result_all_text)
	with open(out_svg, "wt") as output:
		output.write(result_all_text)
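
svg_text_to_paths() matches <text> elements by tag suffix because lxml reports tags in Clark notation ("{namespace}tag"). A minimal sketch of that matching against an inline SVG string (the SVG content is illustrative):

from lxml import etree

svg_source = b"""<svg xmlns="http://www.w3.org/2000/svg"><text x="10" y="20">Hello</text></svg>"""
xml = etree.fromstring(svg_source)

for elem in xml.iter():
	# Comments and processing instructions have non-string tags, so guard before matching.
	if isinstance(elem.tag, str) and elem.tag.endswith("text"):
		print(elem.tag, elem.text)  # {http://www.w3.org/2000/svg}text Hello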
Example No. 9
def remove_image_metadata(filename: Path) -> None:
	"""
	Remove exif metadata from an image.

	INPUTS
	filename: A filename of an image

	OUTPUTS
	None.
	"""

	if filename.suffix == ".xcf" or filename.suffix == ".svg":
		# Skip GIMP XCF and SVG files
		return

	if filename.suffix == ".jpg":
		# JPEG images are lossy, and PIL will recompress them on save.
		# Instead of using PIL, read the byte stream and remove all metadata that way.
		# Inspired by https://github.com/hMatoba/Piexif
		with open(filename, "rb+") as file:
			jpeg_data = file.read()

			if jpeg_data[0:2] != b"\xff\xd8":
				raise se.InvalidFileException(f"Invalid JPEG file: [path][link=file://{filename.resolve()}]{filename}[/].")

			exif_segments = []
			head = 2

			# Get a list of metadata segments from the jpg
			while True:
				if jpeg_data[head: head + 2] == b"\xff\xda":
					break

				length = struct.unpack(">H", jpeg_data[head + 2: head + 4])[0]
				end_point = head + length + 2
				seg = jpeg_data[head: end_point]
				head = end_point

				if head >= len(jpeg_data):
					raise se.InvalidFileException(f"Invalid JPEG file: [path][link=file://{filename.resolve()}]{filename}[/].")

				# See https://www.disktuna.com/list-of-jpeg-markers/
				# and https://exiftool.org/TagNames/JPEG.html
				# These are the 15 "app" segments, EXCEPT app 14, as well as the "comment" segment.
				# This mirrors what exiftool does.
				metadata_segments = [b"\xff\xe1", b"\xff\xe2", b"\xff\xe3", b"\xff\xe4", b"\xff\xe5",
							b"\xff\xe6", b"\xff\xe7", b"\xff\xe8", b"\xff\xe9", b"\xff\xea",
							b"\xff\xeb", b"\xff\xec", b"\xff\xed", b"\xff\xef",
							b"\xff\xfe"]

				if seg[0:2] in metadata_segments:
					exif_segments.append(seg)

			# Now replace those segments with nothing
			for segment in exif_segments:
				jpeg_data = jpeg_data.replace(segment, b"")

			file.seek(0)
			file.write(jpeg_data)
			file.truncate()
	else:
		# PNG and other image types we expect are lossless so we can use PIL to remove metadata
		try:
			image = Image.open(filename)
		except UnidentifiedImageError as ex:
			raise se.InvalidFileException(f"Couldn’t identify image type of [path][link=file://{filename.resolve()}]{filename}[/].") from ex

		data = list(image.getdata())

		image_without_exif = Image.new(image.mode, image.size)
		image_without_exif.putdata(data)

		if image.format == "PNG":
			# Some metadata, like chromaticity and gamma, are useful to preserve in PNGs
			new_exif = PngImagePlugin.PngInfo()
			for key, value in image.info.items():
				if key.lower() == "gamma":
					new_exif.add(b"gAMA", struct.pack("!1I", int(value * 100000)))
				elif key.lower() == "chromaticity":
					new_exif.add(b"cHRM", struct.pack("!8I", \
							int(value[0] * 100000), \
							int(value[1] * 100000), \
							int(value[2] * 100000), \
							int(value[3] * 100000), \
							int(value[4] * 100000), \
							int(value[5] * 100000), \
							int(value[6] * 100000), \
							int(value[7] * 100000)))

			image_without_exif.save(filename, optimize=True, pnginfo=new_exif)
		elif image.format == "TIFF":
			# For some reason, when saving as TIFF we have to cast filename to str() otherwise
			# the save driver throws an exception
			image_without_exif.save(str(filename), compression="tiff_adobe_deflate")
		else:
			image_without_exif.save(str(filename))
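
The JPEG branch above walks marker segments by reading each segment's big-endian length field, which counts the two length bytes plus the payload. A tiny sketch of that arithmetic on a hand-built APP1 segment (the bytes are illustrative, not from a real file):

import struct

payload = b"Exif\x00\x00"
segment = b"\xff\xe1" + struct.pack(">H", len(payload) + 2) + payload  # marker + length + payload

length = struct.unpack(">H", segment[2:4])[0]  # 8: two length bytes plus six payload bytes
end_point = 0 + length + 2                     # same arithmetic as the loop above, with head = 0
print(length, end_point == len(segment))       # 8 True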
Example No. 10
    def generate_endnotes(self) -> Tuple[int, int]:
        """
		Read the epub spine to regenerate all endnotes in order of appearance, starting from 1.
		Changes are written to disk.

		Returns a tuple of (found_endnote_count, changed_endnote_count)
		"""

        processed = 0
        current_note_number = 1
        notes_changed = 0
        change_list = []

        for file_name in self.get_content_files():
            if file_name in [
                    "titlepage.xhtml", "colophon.xhtml", "uncopyright.xhtml",
                    "imprint.xhtml", "halftitlepage.xhtml", "endnotes.xhtml"
            ]:
                continue

            processed += 1

            file_path = self.path / "src/epub/text" / file_name
            try:
                dom = self.get_dom(file_path)
            except Exception as ex:
                raise se.InvalidFileException(
                    f"Couldn’t open file: [path][link=file://{file_path}]{file_path}[/][/]."
                ) from ex

            needs_rewrite = False
            for link in dom.xpath(
                    "/html/body//a[contains(@epub:type, 'noteref')]"):
                old_anchor = ""
                href = link.get_attr("href") or ""
                if href:
                    # Extract just the anchor from a URL (ie, what follows a hash symbol)
                    hash_position = href.find(
                        "#") + 1  # we want the characters AFTER the hash
                    if hash_position > 0:
                        old_anchor = href[hash_position:]

                new_anchor = f"note-{current_note_number:d}"
                if new_anchor != old_anchor:
                    change_list.append(
                        f"Changed {old_anchor} to {new_anchor} in {file_name}")
                    notes_changed += 1
                    # Update the link in the dom
                    link.set_attr("href", f"endnotes.xhtml#{new_anchor}")
                    link.set_attr("id", f"noteref-{current_note_number:d}")
                    link.lxml_element.text = str(current_note_number)
                    needs_rewrite = True

                # Now try to find this in endnotes
                match_old = lambda x, old=old_anchor: x.anchor == old
                matches = list(filter(match_old, self.endnotes))
                if not matches:
                    raise se.InvalidInputException(
                        f"Couldn’t find endnote with anchor [attr]{old_anchor}[/]."
                    )
                if len(matches) > 1:
                    raise se.InvalidInputException(
                        f"Duplicate anchors in endnotes file for anchor [attr]{old_anchor}[/]."
                    )
                # Found a single match, which is what we want
                endnote = matches[0]
                endnote.number = current_note_number
                endnote.matched = True
                # We don't change the anchor or the back ref just yet
                endnote.source_file = file_name
                current_note_number += 1

            # If we need to write back the body text file
            if needs_rewrite:
                with open(file_path, "w") as file:
                    file.write(se.formatting.format_xhtml(dom.to_string()))

        if processed == 0:
            raise se.InvalidInputException(
                "No files processed. Did you update the manifest and order the spine?"
            )

        if notes_changed > 0:
            # Now we need to recreate the endnotes file
            endnotes_dom = self.get_dom(self.path / "src" / "epub" / "text" /
                                        "endnotes.xhtml")
            for ol_node in endnotes_dom.xpath(
                    "/html/body/section[contains(@epub:type, 'endnotes')]/ol[1]"
            ):
                for node in ol_node.xpath(
                        "./li[contains(@epub:type, 'endnote')]"):
                    node.remove()

                self.endnotes.sort(key=lambda endnote: endnote.number)

                for endnote in self.endnotes:
                    if endnote.matched:
                        endnote.node.set_attr("id", f"note-{endnote.number}")

                        for node in endnote.node.xpath(
                                ".//a[contains(@epub:type, 'backlink')]"):
                            node.set_attr(
                                "href",
                                f"{endnote.source_file}#noteref-{endnote.number}"
                            )

                        ol_node.append(endnote.node)

            with open(self.path / "src" / "epub" / "text" / "endnotes.xhtml",
                      "w") as file:
                file.write(se.formatting.format_xhtml(
                    endnotes_dom.to_string()))

        return (current_note_number - 1, notes_changed)
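
The noteref renumbering above recovers the existing anchor by taking everything after the "#" in the href, then compares it against the expected "note-N" anchor. A tiny sketch of that extraction (the href value is illustrative):

href = "endnotes.xhtml#note-27"
hash_position = href.find("#") + 1  # we want the characters after the hash
old_anchor = href[hash_position:] if hash_position > 0 else ""
print(old_anchor)  # note-27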
Example No. 11
def _create_draft(args: Namespace):
    """
	Implementation for `se create-draft`
	"""

    # Put together some variables for later use
    identifier = se.formatting.make_url_safe(
        args.author) + "/" + se.formatting.make_url_safe(args.title)
    title_string = args.title.replace(
        "'", "’") + ", by " + args.author.replace("'", "’")
    sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", args.title)
    pg_producers = []

    if args.translator:
        identifier = identifier + "/" + se.formatting.make_url_safe(
            args.translator)
        title_string = title_string + ". Translated by " + args.translator

    if args.illustrator:
        identifier = identifier + "/" + se.formatting.make_url_safe(
            args.illustrator)
        title_string = title_string + ". Illustrated by " + args.illustrator

    repo_name = identifier.replace("/", "_")

    repo_path = Path(repo_name).resolve()

    if repo_path.is_dir():
        raise se.InvalidInputException(
            f"Directory already exists: [path][link=file://{repo_path}]{repo_path}[/][/]."
        )

    # Download PG HTML and do some fixups
    if args.pg_url:
        if args.offline:
            raise se.RemoteCommandErrorException(
                "Cannot download Project Gutenberg ebook when offline option is enabled."
            )

        args.pg_url = args.pg_url.replace("http://", "https://")

        # Get the ebook metadata
        try:
            response = requests.get(args.pg_url)
            pg_metadata_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook metadata page. Exception: {ex}"
            )

        soup = BeautifulSoup(pg_metadata_html, "lxml")

        # Get the ebook HTML URL from the metadata
        pg_ebook_url = None
        for element in soup.select("a[type^=\"text/html\"]"):
            pg_ebook_url = regex.sub(r"^//", "https://", element["href"])
            pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/",
                                     pg_ebook_url)

        if not pg_ebook_url:
            raise se.RemoteCommandErrorException(
                "Could download ebook metadata, but couldn’t find URL for the ebook HTML."
            )

        # Get the ebook LCSH categories
        pg_subjects = []
        for element in soup.select("td[property=\"dcterms:subject\"]"):
            if element["datatype"] == "dcterms:LCSH":
                for subject_link in element.find("a"):
                    pg_subjects.append(subject_link.strip())

        # Get the PG publication date
        pg_publication_year = None
        for element in soup.select("td[itemprop=\"datePublished\"]"):
            pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1",
                                            element.text)

        # Get the actual ebook URL
        try:
            response = requests.get(pg_ebook_url)
            pg_ebook_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook HTML. Exception: {ex}"
            )

        try:
            fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
            pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
        except Exception as ex:
            raise se.InvalidEncodingException(
                f"Couldn’t determine text encoding of Project Gutenberg HTML file. Exception: {ex}"
            )

        # Try to guess the ebook language
        pg_language = "en-US"
        if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
            pg_language = "en-GB"

    # Create necessary directories
    (repo_path / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "css").mkdir(parents=True)
    (repo_path / "src" / "epub" / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "text").mkdir(parents=True)
    (repo_path / "src" / "META-INF").mkdir(parents=True)

    is_pg_html_parsed = True

    # Write PG data if we have it
    if args.pg_url and pg_ebook_html:
        try:
            soup = BeautifulSoup(pg_ebook_html, "html.parser")

            # Try to get the PG producers.  We only try this if there's a <pre> block with the header info (which is not always the case)
            for element in soup(text=regex.compile(r"\*\*\*\s*Produced by.+$",
                                                   flags=regex.DOTALL)):
                if element.parent.name == "pre":
                    producers_text = regex.sub(r".+?Produced by (.+?)\s*$",
                                               "\\1",
                                               element,
                                               flags=regex.DOTALL)
                    producers_text = regex.sub(r"\(.+?\)",
                                               "",
                                               producers_text,
                                               flags=regex.DOTALL)
                    producers_text = regex.sub(
                        r"(at )?https?://www\.pgdp\.net",
                        "",
                        producers_text,
                        flags=regex.DOTALL)
                    producers_text = regex.sub(r"[\r\n]+",
                                               " ",
                                               producers_text,
                                               flags=regex.DOTALL)
                    producers_text = regex.sub(r",? and ", ", and ",
                                               producers_text)
                    producers_text = producers_text.replace(
                        " and the Online", " and The Online")
                    producers_text = producers_text.replace(", and ",
                                                            ", ").strip()

                    pg_producers = producers_text.split(", ")

            # Try to strip out the PG header
            for element in soup(text=regex.compile(r"\*\*\*\s*START OF THIS")):
                for sibling in element.parent.find_previous_siblings():
                    sibling.decompose()

                element.parent.decompose()

            # Try to strip out the PG license footer
            for element in soup(
                    text=regex.compile(r"End of (the )?Project Gutenberg")):
                for sibling in element.parent.find_next_siblings():
                    sibling.decompose()

                element.parent.decompose()

            with open(repo_path / "src" / "epub" / "text" / "body.xhtml",
                      "w",
                      encoding="utf-8") as file:
                file.write(str(soup))
        except OSError as ex:
            raise se.InvalidFileException(
                f"Couldn’t write to ebook directory. Exception: {ex}")
        except Exception:
            # Save this error for later, because it's still useful to complete the create-draft process
            # even if we've failed to parse PG's HTML source.
            is_pg_html_parsed = False
            se.quiet_remove(repo_path / "src" / "epub" / "text" / "body.xhtml")

    # Copy over templates

    _copy_template_file("gitignore", repo_path / ".gitignore")
    _copy_template_file("LICENSE.md", repo_path)
    _copy_template_file("container.xml", repo_path / "src" / "META-INF")
    _copy_template_file("mimetype", repo_path / "src")
    _copy_template_file("content.opf", repo_path / "src" / "epub")
    _copy_template_file("onix.xml", repo_path / "src" / "epub")
    _copy_template_file("toc.xhtml", repo_path / "src" / "epub")
    _copy_template_file("core.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("local.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("logo.svg", repo_path / "src" / "epub" / "images")
    _copy_template_file("colophon.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("imprint.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("uncopyright.xhtml",
                        repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.svg", repo_path / "images")
    _copy_template_file("cover.jpg", repo_path / "images" / "cover.jpg")
    _copy_template_file("cover.svg", repo_path / "images" / "cover.svg")

    # Try to find Wikipedia links if possible
    if args.offline:
        author_wiki_url = None
        author_nacoaf_url = None
        ebook_wiki_url = None
        translator_wiki_url = None
        translator_nacoaf_url = None
    else:
        author_wiki_url, author_nacoaf_url = _get_wikipedia_url(
            args.author, True)
        ebook_wiki_url = None
        if args.title != "Short Fiction":
            # There's a "Short Fiction" Wikipedia article, so make an exception for that case
            ebook_wiki_url, _ = _get_wikipedia_url(args.title, False)
        translator_wiki_url = None
        if args.translator:
            translator_wiki_url, translator_nacoaf_url = _get_wikipedia_url(
                args.translator, True)

    # Pre-fill a few templates
    _replace_in_file(repo_path / "src" / "epub" / "text" / "titlepage.xhtml",
                     "TITLE_STRING", title_string)
    _replace_in_file(repo_path / "images" / "titlepage.svg", "TITLE_STRING",
                     title_string)
    _replace_in_file(repo_path / "images" / "cover.svg", "TITLE_STRING",
                     title_string)

    # Create the titlepage SVG
    contributors = {}
    if args.translator:
        contributors["translated by"] = args.translator

    if args.illustrator:
        contributors["illustrated by"] = args.illustrator

    with open(repo_path / "images" / "titlepage.svg", "w",
              encoding="utf-8") as file:
        file.write(
            _generate_titlepage_svg(args.title, args.author, contributors,
                                    title_string))

    # Create the cover SVG
    with open(repo_path / "images" / "cover.svg", "w",
              encoding="utf-8") as file:
        file.write(_generate_cover_svg(args.title, args.author, title_string))

    # Build the cover/titlepage for distribution
    epub = SeEpub(repo_path)
    epub.generate_cover_svg()
    epub.generate_titlepage_svg()

    if args.pg_url:
        _replace_in_file(repo_path / "src" / "epub" / "text" / "imprint.xhtml",
                         "PG_URL", args.pg_url)

    with open(repo_path / "src" / "epub" / "text" / "colophon.xhtml",
              "r+",
              encoding="utf-8") as file:
        colophon_xhtml = file.read()

        colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
        colophon_xhtml = colophon_xhtml.replace(">AUTHOR<", f">{args.author}<")
        colophon_xhtml = colophon_xhtml.replace("TITLE", args.title)

        if author_wiki_url:
            colophon_xhtml = colophon_xhtml.replace("AUTHOR_WIKI_URL",
                                                    author_wiki_url)

        if args.pg_url:
            colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

            if pg_publication_year:
                colophon_xhtml = colophon_xhtml.replace(
                    "PG_YEAR", pg_publication_year)

            if pg_producers:
                producers_xhtml = ""
                for i, producer in enumerate(pg_producers):
                    if "Distributed Proofread" in producer:
                        producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
                    elif "anonymous" in producer.lower():
                        producers_xhtml = producers_xhtml + "<b class=\"name\">An Anonymous Volunteer</b>"
                    else:
                        producers_xhtml = producers_xhtml + f"<b class=\"name\">{producer.strip('.')}</b>"

                    if i < len(pg_producers) - 1:
                        producers_xhtml = producers_xhtml + ", "

                    if i == len(pg_producers) - 2:
                        producers_xhtml = producers_xhtml + "and "

                producers_xhtml = producers_xhtml + "<br/>"

                colophon_xhtml = colophon_xhtml.replace(
                    "<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>",
                    producers_xhtml)

        file.seek(0)
        file.write(colophon_xhtml)
        file.truncate()

    with open(repo_path / "src" / "epub" / "content.opf",
              "r+",
              encoding="utf-8") as file:
        metadata_xml = file.read()

        metadata_xml = metadata_xml.replace("SE_IDENTIFIER", identifier)
        metadata_xml = metadata_xml.replace(">AUTHOR<", f">{args.author}<")
        metadata_xml = metadata_xml.replace(">TITLE_SORT<",
                                            f">{sorted_title}<")
        metadata_xml = metadata_xml.replace(">TITLE<", f">{args.title}<")
        metadata_xml = metadata_xml.replace("VCS_IDENTIFIER", str(repo_name))

        if pg_producers:
            producers_xhtml = ""
            i = 1
            for producer in pg_producers:
                if "Distributed Proofread" in producer:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">The Online Distributed Proofreading Team</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{i}\">https://pgdp.net</meta>\n"
                elif "anonymous" in producer.lower():
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">An Anonymous Volunteer</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Anonymous Volunteer, An</meta>\n"
                else:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">{producer.strip('.')}</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">TRANSCRIBER_SORT</meta>\n"

                producers_xhtml = producers_xhtml + f"\t\t<meta property=\"role\" refines=\"#transcriber-{i}\" scheme=\"marc:relators\">trc</meta>\n"

                i = i + 1

            metadata_xml = regex.sub(
                r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>",
                "\t\t" + producers_xhtml.strip(),
                metadata_xml,
                flags=regex.DOTALL)

        if author_wiki_url:
            metadata_xml = metadata_xml.replace(">AUTHOR_WIKI_URL<",
                                                f">{author_wiki_url}<")

        if author_nacoaf_url:
            metadata_xml = metadata_xml.replace(">AUTHOR_NACOAF_URL<",
                                                f">{author_nacoaf_url}<")

        if ebook_wiki_url:
            metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<",
                                                f">{ebook_wiki_url}<")

        if args.translator:
            metadata_xml = metadata_xml.replace(">TRANSLATOR<",
                                                f">{args.translator}<")

            if translator_wiki_url:
                metadata_xml = metadata_xml.replace(
                    ">TRANSLATOR_WIKI_URL<", f">{translator_wiki_url}<")

            if translator_nacoaf_url:
                metadata_xml = metadata_xml.replace(
                    ">TRANSLATOR_NACOAF_URL<", f">{translator_nacoaf_url}<")
        else:
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?<dc:contributor id=\"artist\">",
                "<dc:contributor id=\"artist\">",
                metadata_xml,
                flags=regex.DOTALL)

        if args.pg_url:
            if pg_subjects:
                subject_xhtml = ""

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<dc:subject id=\"subject-{i}\">{subject}</dc:subject>\n"
                    i = i + 1

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<meta property=\"authority\" refines=\"#subject-{i}\">LCSH</meta>\n"

                    # Now, get the LCSH ID by querying LCSH directly.
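                    # We scrape the public search results page and take the first authority link whose
                    # label matches the subject; the regex below assumes the label shows subdivisions as
                    # "--", hence the " -- " -> "--" replacement.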
                    try:
                        response = requests.get(
                            f"https://id.loc.gov/search/?q=%22{urllib.parse.quote(subject)}%22"
                        )
                        result = regex.search(
                            fr"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{regex.escape(subject.replace(' -- ', '--'))}</a>",
                            response.text)

                        loc_id = "Unknown"
                        try:
                            loc_id = result.group(1)
                        except Exception as ex:
                            pass

                        subject_xhtml = subject_xhtml + f"\t\t<meta property=\"term\" refines=\"#subject-{i}\">{loc_id}</meta>\n"

                    except Exception as ex:
                        raise se.RemoteCommandErrorException(
                            f"Couldn’t connect to [url][link=https://id.loc.gov]https://id.loc.gov[/][/]. Exception: {ex}"
                        )

                    i = i + 1

                metadata_xml = regex.sub(
                    r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>",
                    "\t\t" + subject_xhtml.strip(), metadata_xml)

            metadata_xml = metadata_xml.replace(
                "<dc:language>LANG</dc:language>",
                f"<dc:language>{pg_language}</dc:language>")
            metadata_xml = metadata_xml.replace(
                "<dc:source>PG_URL</dc:source>",
                f"<dc:source>{args.pg_url}</dc:source>")

        file.seek(0)
        file.write(metadata_xml)
        file.truncate()

    # Set up local git repo
    repo = git.Repo.init(repo_path)

    if args.email:
        with repo.config_writer() as config:
            config.set_value("user", "email", args.email)

    if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
        raise se.InvalidXhtmlException(
            "Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook."
        )

def process_all_content(file_list: list) -> Tuple[list, list]:
    """
	Analyze the whole content of the project, and build and return lists
	of toc_items and landmarks.

	INPUTS:
	file_list: a list of paths to all content files

	OUTPUTS:
	a tuple containing the list of Toc items and the list of landmark items
	"""

    toc_list: List[TocItem] = []
    landmarks: List[TocItem] = []

    # We make two passes through the work, because we need to know
    # how many bodymatter items there are. So we do landmarks first.
    for textf in file_list:
        try:
            with open(textf, encoding="utf-8") as file:
                dom = se.easy_xml.EasyXmlTree(file.read())
        except Exception as ex:
            raise se.InvalidFileException(
                f"Couldn’t open file: [path][link=file://{textf}]{textf}[/][/]. Exception: {ex}"
            ) from ex

        add_landmark(dom, textf.name, landmarks)

    # Now we test to see if there is only one body item
    body_items = [item for item in landmarks if item.place == Position.BODY]
    single_file = (len(body_items) == 1)
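    # process_headings() below needs this flag; a work contained in a single bodymatter file is handled specially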

    nest_under_halftitle = False

    for textf in file_list:
        with open(textf, "r", encoding="utf-8") as file:
            dom = se.easy_xml.EasyXmlTree(file.read())
        process_headings(dom, textf.name, toc_list, nest_under_halftitle,
                         single_file)
        if dom.xpath("/html/body//*[contains(@epub:type, 'halftitlepage')]"):
            nest_under_halftitle = True

    # Now go through, adjusting for nesting under the halftitle
    if nest_under_halftitle:
        # Tricky, because a few books have forewords etc. AFTER the halftitle, so we have to know whether we've passed it
        passed_halftitle = False
        for toc_item in toc_list:
            if toc_item.place == Position.BODY:
                toc_item.level += 1
            if passed_halftitle and toc_item.place == Position.FRONT:
                toc_item.level += 1
            if "halftitle" in toc_item.file_link:
                passed_halftitle = True

    # We add this dummy item because outputtoc always needs to look ahead to the next item.
    last_toc = TocItem()
    last_toc.level = 1
    last_toc.title = "dummy"
    toc_list.append(last_toc)

    return landmarks, toc_list
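A small, self-contained sketch of the halftitle-nesting adjustment above, using stand-in TocItem and Position classes (the real ones come from the se toolset); it shows frontmatter placed after the halftitle being pushed one level deeper along with the bodymatter:

from dataclasses import dataclass
from enum import Enum, auto

class Position(Enum):
    FRONT = auto()
    BODY = auto()
    BACK = auto()

@dataclass
class TocItem:
    file_link: str
    place: Position
    level: int = 1

toc_list = [
    TocItem("titlepage.xhtml", Position.FRONT),
    TocItem("halftitle.xhtml", Position.FRONT),
    TocItem("foreword.xhtml", Position.FRONT),  # frontmatter that appears AFTER the halftitle
    TocItem("chapter-1.xhtml", Position.BODY),
]

passed_halftitle = False
for toc_item in toc_list:
    if toc_item.place == Position.BODY:
        toc_item.level += 1
    if passed_halftitle and toc_item.place == Position.FRONT:
        toc_item.level += 1
    if "halftitle" in toc_item.file_link:
        passed_halftitle = True

print([(item.file_link, item.level) for item in toc_list])
# [('titlepage.xhtml', 1), ('halftitle.xhtml', 1), ('foreword.xhtml', 2), ('chapter-1.xhtml', 2)]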
Exemplo n.º 13
0
    def generate_endnotes(self) -> str:
        """
		The generate_endnotes() function is very large, so for readability and maintainability
		it's kept in a separate file. Strictly speaking, that file could be inlined
		into this class.
		"""

        processed = 0
        report = ""
        current_note_number = 1
        notes_changed = 0
        change_list = []

        for file_name in self.get_content_files():
            if file_name in [
                    "titlepage.xhtml", "colophon.xhtml", "uncopyright.xhtml",
                    "imprint.xhtml", "halftitle.xhtml", "endnotes.xhtml"
            ]:
                continue

            processed += 1

            file_path = self.path / "src" / "epub" / "text" / file_name
            try:
                with open(file_path) as file:
                    soup = BeautifulSoup(file.read(), "lxml")
            except Exception as ex:
                raise se.InvalidFileException("Couldn't open file: {}".format(
                    str(file_path))) from ex

            links = soup.find_all("a")
            needs_rewrite = False
            for link in links:
                epub_type = link.get("epub:type") or ""
                if epub_type == "noteref":
                    old_anchor = ""
                    href = link.get("href") or ""
                    if href:
                        # Extract just the anchor from a URL (ie, what follows a hash symbol)
                        old_anchor = ""

                        hash_position = href.find(
                            "#") + 1  # we want the characters AFTER the hash
                        if hash_position > 0:
                            old_anchor = href[hash_position:]

                    new_anchor = "note-{:d}".format(current_note_number)
                    if new_anchor != old_anchor:
                        change_list.append("Changed " + old_anchor + " to " +
                                           new_anchor + " in " + file_name)
                        notes_changed += 1
                        # Update the link in the soup object
                        link["href"] = 'endnotes.xhtml#' + new_anchor
                        link["id"] = 'noteref-{:d}'.format(current_note_number)
                        link.string = str(current_note_number)
                        needs_rewrite = True
                    # Now try to find this in endnotes
                    matches = list(
                        filter(lambda x, old=old_anchor: x.anchor == old,
                               self.endnotes))
                    if not matches:
                        raise se.InvalidInputException(
                            "Couldn't find endnote with anchor " + old_anchor)
                    if len(matches) > 1:
                        raise se.InvalidInputException(
                            "Duplicate anchors in endnotes file for anchor " +
                            old_anchor)
                    # Found a single match, which is what we want
                    endnote = matches[0]
                    endnote.number = current_note_number
                    endnote.matched = True
                    # We don't change the anchor or the back ref just yet
                    endnote.source_file = file_name
                    current_note_number += 1

            # If we need to write back the body text file
            if needs_rewrite:
                with open(file_path, "w") as new_file:
                    new_file.write(se.formatting.format_xhtml(str(soup)))

        if processed == 0:
            report += "No files processed. Did you update the manifest and order the spine?" + "\n"
        else:
            report += "Found {:d} endnotes.".format(current_note_number -
                                                    1) + "\n"
            if notes_changed > 0:
                # Now we need to recreate the endnotes file
                ol_tag = self._endnotes_soup.ol
                ol_tag.clear()
                for endnote in self.endnotes:
                    if endnote.matched:
                        li_tag = self._endnotes_soup.new_tag("li")
                        li_tag["id"] = "note-" + str(endnote.number)
                        li_tag["epub:type"] = "endnote"
                        for content in endnote.contents:
                            if isinstance(content, Tag):
                                links = content.find_all("a")
                                for link in links:
                                    epub_type = link.get("epub:type") or ""
                                    if epub_type == "se:referrer":
                                        href = link.get("href") or ""
                                        if href:
                                            link["href"] = endnote.source_file + "#noteref-" + str(endnote.number)
                            li_tag.append(content)
                        ol_tag.append(li_tag)

                with open(
                        self.path / "src" / "epub" / "text" / "endnotes.xhtml",
                        "w") as file:
                    file.write(
                        se.formatting.format_xhtml(str(self._endnotes_soup),
                                                   is_endnotes_file=True))

                report += "Changed {:d} endnote{}.".format(
                    notes_changed, "s" if notes_changed != 1 else "")
            else:
                report += "No changes made."
        return report
Exemplo n.º 14
0
	def generate_endnotes(self) -> Tuple[int, int]:
		"""
		Read the epub spine to regenerate all endnotes in order of appearance, starting from 1.
		Changes are written to disk.

		Returns a tuple of (found_endnote_count, changed_endnote_count)
		"""

		processed = 0
		current_note_number = 1
		notes_changed = 0
		change_list = []

		for file_name in self.get_content_files():
			if file_name in ["titlepage.xhtml", "colophon.xhtml", "uncopyright.xhtml", "imprint.xhtml", "halftitle.xhtml", "endnotes.xhtml"]:
				continue

			processed += 1

			file_path = self.path / "src/epub/text" / file_name
			try:
				with open(file_path) as file:
					soup = BeautifulSoup(file.read(), "lxml")
			except Exception as ex:
				raise se.InvalidFileException(f"Couldn’t open file: [path][link=file://{file_path}]{file_path}[/][/].") from ex

			links = soup.find_all("a")
			needs_rewrite = False
			for link in links:
				epub_type = link.get("epub:type") or ""
				if epub_type == "noteref":
					old_anchor = ""
					href = link.get("href") or ""
					if href:
						# Extract just the anchor from a URL (ie, what follows a hash symbol)
						old_anchor = ""

						hash_position = href.find("#") + 1  # we want the characters AFTER the hash
						if hash_position > 0:
							old_anchor = href[hash_position:]

					new_anchor = f"note-{current_note_number:d}"
					if new_anchor != old_anchor:
						change_list.append(f"Changed {old_anchor} to {new_anchor} in {file_name}")
						notes_changed += 1
						# Update the link in the soup object
						link["href"] = 'endnotes.xhtml#' + new_anchor
						link["id"] = f'noteref-{current_note_number:d}'
						link.string = str(current_note_number)
						needs_rewrite = True
					# Now try to find this in endnotes
					match_old = lambda x, old=old_anchor: x.anchor == old
					matches = list(filter(match_old, self.endnotes))
					if not matches:
						raise se.InvalidInputException(f"Couldn’t find endnote with anchor [attr]{old_anchor}[/].")
					if len(matches) > 1:
						raise se.InvalidInputException(f"Duplicate anchors in endnotes file for anchor [attr]{old_anchor}[/].")
					# Found a single match, which is what we want
					endnote = matches[0]
					endnote.number = current_note_number
					endnote.matched = True
					# We don't change the anchor or the back ref just yet
					endnote.source_file = file_name
					current_note_number += 1

			# If we need to write back the body text file
			if needs_rewrite:
				with open(file_path, "w") as new_file:
					new_file.write(se.formatting.format_xhtml(str(soup)))

		if processed == 0:
			raise se.InvalidInputException("No files processed. Did you update the manifest and order the spine?")

		if notes_changed > 0:
			# Now we need to recreate the endnotes file
			ol_tag = self._endnotes_soup.ol
			ol_tag.clear()

			self.endnotes.sort(key=lambda endnote: endnote.number)

			for endnote in self.endnotes:
				if endnote.matched:
					li_tag = self._endnotes_soup.new_tag("li")
					li_tag["id"] = "note-" + str(endnote.number)
					li_tag["epub:type"] = "endnote"
					for content in endnote.contents:
						if isinstance(content, Tag):
							links = content.find_all("a")
							for link in links:
								epub_type = link.get("epub:type") or ""
								if epub_type == "backlink":
									href = link.get("href") or ""
									if href:
										link["href"] = endnote.source_file + "#noteref-" + str(endnote.number)
						li_tag.append(content)
					ol_tag.append(li_tag)

			with open(self.path / "src" / "epub" / "text" / "endnotes.xhtml", "w") as file:
				file.write(se.formatting.format_xhtml(str(self._endnotes_soup)))

		return (current_note_number - 1, notes_changed)
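Unlike the previous example, this version returns a bare (found, changed) tuple instead of a formatted report. A minimal sketch of how a caller could rebuild that report from the tuple (the function name here is illustrative, not part of the se toolset):

def format_endnote_report(found: int, changed: int) -> str:
    # Turn the (found, changed) tuple returned by generate_endnotes() into a short human-readable summary
    report = f"Found {found} endnote{'s' if found != 1 else ''}."
    if changed:
        report += f" Changed {changed} endnote{'s' if changed != 1 else ''}."
    else:
        report += " No changes made."
    return report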
Exemplo n.º 15
0
def create_draft(args: list):
    """
	Entry point for `se create-draft`
	"""

    # Put together some variables for later use
    identifier = se.formatting.make_url_safe(
        args.author) + "/" + se.formatting.make_url_safe(args.title)
    title_string = args.title.replace(
        "'", "’") + ", by " + args.author.replace("'", "’")
    sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", args.title)
    pg_producers = []

    if args.translator:
        identifier = identifier + "/" + se.formatting.make_url_safe(
            args.translator)
        title_string = title_string + ". Translated by " + args.translator

    if args.illustrator:
        identifier = identifier + "/" + se.formatting.make_url_safe(
            args.illustrator)
        title_string = title_string + ". Illustrated by " + args.illustrator

    repo_name = Path(identifier.replace("/", "_"))

    if repo_name.is_dir():
        raise se.InvalidInputException(
            "./{}/ already exists.".format(repo_name))

    # Download PG HTML and do some fixups
    if args.pg_url:
        args.pg_url = args.pg_url.replace("http://", "https://")

        # Get the ebook metadata
        try:
            response = requests.get(args.pg_url)
            pg_metadata_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                "Couldn’t download Project Gutenberg ebook metadata page. Error: {}"
                .format(ex))

        soup = BeautifulSoup(pg_metadata_html, "lxml")

        # Get the ebook HTML URL from the metadata
        pg_ebook_url = None
        for element in soup.select("a[type^=\"text/html\"]"):
            pg_ebook_url = regex.sub(r"^//", "https://", element["href"])
            pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/",
                                     pg_ebook_url)

        if not pg_ebook_url:
            raise se.RemoteCommandErrorException(
                "Could download ebook metadata, but couldn’t find URL for the ebook HTML."
            )

        # Get the ebook LCSH categories
        pg_subjects = []
        for element in soup.select("td[property=\"dcterms:subject\"]"):
            if element["datatype"] == "dcterms:LCSH":
                for subject_link in element.find("a"):
                    pg_subjects.append(subject_link.strip())

        # Get the PG publication date
        pg_publication_year = None
        for element in soup.select("td[itemprop=\"datePublished\"]"):
            pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1",
                                            element.text)

        # Get the actual ebook URL
        try:
            response = requests.get(pg_ebook_url)
            pg_ebook_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                "Couldn’t download Project Gutenberg ebook HTML. Error: {}".
                format(ex))

        try:
            fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
            pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
        except Exception as ex:
            raise se.InvalidEncodingException(
                "Couldn’t determine text encoding of Project Gutenberg HTML file. Error: {}"
                .format(ex))

        # Try to guess the ebook language
        pg_language = "en-US"
        if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
            pg_language = "en-GB"

    # Create necessary directories
    (repo_name / "images").mkdir(parents=True)
    (repo_name / "src" / "epub" / "css").mkdir(parents=True)
    (repo_name / "src" / "epub" / "images").mkdir(parents=True)
    (repo_name / "src" / "epub" / "text").mkdir(parents=True)
    (repo_name / "src" / "META-INF").mkdir(parents=True)

    # Write PG data if we have it
    if args.pg_url and pg_ebook_html:
        try:
            soup = BeautifulSoup(pg_ebook_html, "html.parser")

            # Try to get the PG producers.  We only try this if there's a <pre> block with the header info (which is not always the case)
            for element in soup(text=regex.compile(r"\*\*\*\s*Produced by.+$",
                                                   flags=regex.DOTALL)):
                if element.parent.name == "pre":
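                    # Strip the boilerplate around the credit (parentheticals, the PGDP URL, line breaks),
                    # then split the remainder into individual names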
                    pg_producers = regex.sub(r".+?Produced by (.+?)\s*$",
                                             "\\1",
                                             element,
                                             flags=regex.DOTALL)
                    pg_producers = regex.sub(r"\(.+?\)",
                                             "",
                                             pg_producers,
                                             flags=regex.DOTALL)
                    pg_producers = regex.sub(r"(at )?https?://www\.pgdp\.net",
                                             "",
                                             pg_producers,
                                             flags=regex.DOTALL)
                    pg_producers = regex.sub(r"[\r\n]+",
                                             " ",
                                             pg_producers,
                                             flags=regex.DOTALL)
                    pg_producers = regex.sub(r",? and ", ", and ",
                                             pg_producers)
                    pg_producers = pg_producers.replace(
                        " and the Online", " and The Online")
                    pg_producers = pg_producers.replace(
                        ", and ", ", ").strip().split(", ")

            # Try to strip out the PG header
            for element in soup(text=regex.compile(r"\*\*\*\s*START OF THIS")):
                for sibling in element.parent.find_previous_siblings():
                    sibling.decompose()

                element.parent.decompose()

            # Try to strip out the PG license footer
            for element in soup(
                    text=regex.compile(r"End of (the )?Project Gutenberg")):
                for sibling in element.parent.find_next_siblings():
                    sibling.decompose()

                element.parent.decompose()

            with open(repo_name / "src" / "epub" / "text" / "body.xhtml",
                      "w",
                      encoding="utf-8") as file:
                file.write(str(soup))
        except IOError as ex:
            raise se.InvalidFileException(
                "Couldn’t write to ebook directory. Error: {}".format(ex))
        except Exception as ex:
            raise se.InvalidInputException(
                "Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook."
            ) from ex

    # Copy over templates
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "gitignore")),
        repo_name / ".gitignore")
    shutil.copy(
        resource_filename("se",
                          str(Path("data") / "templates" / "LICENSE.md")),
        repo_name)
    shutil.copy(
        resource_filename(
            "se",
            str(Path("data") / "templates" / "META-INF" / "container.xml")),
        repo_name / "src" / "META-INF")
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "mimetype")),
        repo_name / "src")
    shutil.copy(
        resource_filename("se",
                          str(Path("data") / "templates" / "content.opf")),
        repo_name / "src" / "epub")
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "onix.xml")),
        repo_name / "src" / "epub")
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "toc.xhtml")),
        repo_name / "src" / "epub")
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "core.css")),
        repo_name / "src" / "epub" / "css")
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "local.css")),
        repo_name / "src" / "epub" / "css")
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "logo.svg")),
        repo_name / "src" / "epub" / "images")
    shutil.copy(
        resource_filename("se",
                          str(Path("data") / "templates" / "colophon.xhtml")),
        repo_name / "src" / "epub" / "text")
    shutil.copy(
        resource_filename("se",
                          str(Path("data") / "templates" / "imprint.xhtml")),
        repo_name / "src" / "epub" / "text")
    shutil.copy(
        resource_filename("se",
                          str(Path("data") / "templates" / "titlepage.xhtml")),
        repo_name / "src" / "epub" / "text")
    shutil.copy(
        resource_filename(
            "se", str(Path("data") / "templates" / "uncopyright.xhtml")),
        repo_name / "src" / "epub" / "text")
    shutil.copy(
        resource_filename("se",
                          str(Path("data") / "templates" / "titlepage.svg")),
        repo_name / "images")
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "cover.jpg")),
        repo_name / "images" / "cover.jpg")
    shutil.copy(
        resource_filename("se", str(Path("data") / "templates" / "cover.svg")),
        repo_name / "images" / "cover.svg")

    # Try to find Wikipedia links if possible
    author_wiki_url, author_nacoaf_url = _get_wikipedia_url(args.author, True)
    ebook_wiki_url, _ = _get_wikipedia_url(args.title, False)
    translator_wiki_url = None
    if args.translator:
        translator_wiki_url, translator_nacoaf_url = _get_wikipedia_url(
            args.translator, True)

    # Pre-fill a few templates
    se.replace_in_file(repo_name / "src" / "epub" / "text" / "titlepage.xhtml",
                       "TITLE_STRING", title_string)
    se.replace_in_file(repo_name / "images" / "titlepage.svg", "TITLE_STRING",
                       title_string)
    se.replace_in_file(repo_name / "images" / "cover.svg", "TITLE_STRING",
                       title_string)

    # Create the titlepage SVG
    contributors = {}
    if args.translator:
        contributors["translated by"] = args.translator

    if args.illustrator:
        contributors["illustrated by"] = args.illustrator

    with open(repo_name / "images" / "titlepage.svg", "w",
              encoding="utf-8") as file:
        file.write(
            _generate_titlepage_svg(args.title, args.author, contributors,
                                    title_string))

    # Create the cover SVG
    with open(repo_name / "images" / "cover.svg", "w",
              encoding="utf-8") as file:
        file.write(_generate_cover_svg(args.title, args.author, title_string))

    if args.pg_url:
        se.replace_in_file(
            repo_name / "src" / "epub" / "text" / "imprint.xhtml", "PG_URL",
            args.pg_url)

    with open(repo_name / "src" / "epub" / "text" / "colophon.xhtml",
              "r+",
              encoding="utf-8") as file:
        colophon_xhtml = file.read()

        colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
        colophon_xhtml = colophon_xhtml.replace(">AUTHOR<",
                                                ">{}<".format(args.author))
        colophon_xhtml = colophon_xhtml.replace("TITLE", args.title)

        if author_wiki_url:
            colophon_xhtml = colophon_xhtml.replace("AUTHOR_WIKI_URL",
                                                    author_wiki_url)

        if args.pg_url:
            colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

            if pg_publication_year:
                colophon_xhtml = colophon_xhtml.replace(
                    "PG_YEAR", pg_publication_year)

            if pg_producers:
                producers_xhtml = ""
                for i, producer in enumerate(pg_producers):
                    if "Distributed Proofreading" in producer:
                        producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
                    else:
                        producers_xhtml = producers_xhtml + "<b class=\"name\">{}</b>".format(
                            producer)

                    if i < len(pg_producers) - 1:
                        producers_xhtml = producers_xhtml + ", "

                    if i == len(pg_producers) - 2:
                        producers_xhtml = producers_xhtml + "and "

                producers_xhtml = producers_xhtml + "<br/>"

                colophon_xhtml = colophon_xhtml.replace(
                    "<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>",
                    producers_xhtml)

        file.seek(0)
        file.write(colophon_xhtml)
        file.truncate()

    with open(repo_name / "src" / "epub" / "content.opf",
              "r+",
              encoding="utf-8") as file:
        metadata_xhtml = file.read()

        metadata_xhtml = metadata_xhtml.replace("SE_IDENTIFIER", identifier)
        metadata_xhtml = metadata_xhtml.replace(">AUTHOR<",
                                                ">{}<".format(args.author))
        metadata_xhtml = metadata_xhtml.replace(">TITLE_SORT<",
                                                ">{}<".format(sorted_title))
        metadata_xhtml = metadata_xhtml.replace(">TITLE<",
                                                ">{}<".format(args.title))
        metadata_xhtml = metadata_xhtml.replace("VCS_IDENTIFIER",
                                                str(repo_name))

        if pg_producers:
            producers_xhtml = ""
            i = 1
            for producer in pg_producers:
                producers_xhtml = producers_xhtml + "\t\t<dc:contributor id=\"transcriber-{}\">{}</dc:contributor>\n".format(
                    i, producer)

                if "Distributed Proofreading" in producer:
                    producers_xhtml = producers_xhtml + "\t\t<meta property=\"file-as\" refines=\"#transcriber-{0}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{0}\">https://pgdp.net</meta>\n".format(
                        i)
                else:
                    producers_xhtml = producers_xhtml + "\t\t<meta property=\"file-as\" refines=\"#transcriber-{}\">TRANSCRIBER_SORT</meta>\n".format(
                        i)

                producers_xhtml = producers_xhtml + "\t\t<meta property=\"role\" refines=\"#transcriber-{}\" scheme=\"marc:relators\">trc</meta>\n".format(
                    i)

                i = i + 1

            metadata_xhtml = regex.sub(
                r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>",
                "\t\t" + producers_xhtml.strip(),
                metadata_xhtml,
                flags=regex.DOTALL)

        if author_wiki_url:
            metadata_xhtml = metadata_xhtml.replace(
                ">AUTHOR_WIKI_URL<", ">{}<".format(author_wiki_url))

        if author_nacoaf_url:
            metadata_xhtml = metadata_xhtml.replace(
                ">AUTHOR_NACOAF_URL<", ">{}<".format(author_nacoaf_url))

        if ebook_wiki_url:
            metadata_xhtml = metadata_xhtml.replace(
                ">EBOOK_WIKI_URL<", ">{}<".format(ebook_wiki_url))

        if args.translator:
            metadata_xhtml = metadata_xhtml.replace(
                ">TRANSLATOR<", ">{}<".format(args.translator))

            if translator_wiki_url:
                metadata_xhtml = metadata_xhtml.replace(
                    ">TRANSLATOR_WIKI_URL<",
                    ">{}<".format(translator_wiki_url))

            if translator_nacoaf_url:
                metadata_xhtml = metadata_xhtml.replace(
                    ">TRANSLATOR_NACOAF_URL<",
                    ">{}<".format(translator_nacoaf_url))
        else:
            metadata_xhtml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?<dc:contributor id=\"artist\">",
                "<dc:contributor id=\"artist\">",
                metadata_xhtml,
                flags=regex.DOTALL)

        if args.pg_url:
            if pg_subjects:
                subject_xhtml = ""

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + "\t\t<dc:subject id=\"subject-{}\">{}</dc:subject>\n".format(
                        i, subject)
                    i = i + 1

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + "\t\t<meta property=\"authority\" refines=\"#subject-{}\">LCSH</meta>\n".format(
                        i)

                    # Now, get the LCSH ID by querying LCSH directly.
                    try:
                        response = requests.get(
                            "http://id.loc.gov/search/?q=%22{}%22".format(
                                urllib.parse.quote(subject)))
                        result = regex.search(
                            r"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{}</a>"
                            .format(regex.escape(subject.replace(" -- ",
                                                                 "--"))),
                            response.text)

                        loc_id = "Unknown"
                        try:
                            loc_id = result.group(1)
                        except Exception as ex:
                            pass

                        subject_xhtml = subject_xhtml + "\t\t<meta property=\"term\" refines=\"#subject-{}\">{}</meta>\n".format(
                            i, loc_id)

                    except Exception as ex:
                        raise se.RemoteCommandErrorException(
                            "Couldn’t connect to id.loc.gov. Error: {}".format(
                                ex))

                    i = i + 1

                metadata_xhtml = regex.sub(
                    r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>",
                    "\t\t" + subject_xhtml.strip(), metadata_xhtml)

            metadata_xhtml = metadata_xhtml.replace(
                "<dc:language>LANG</dc:language>",
                "<dc:language>{}</dc:language>".format(pg_language))
            metadata_xhtml = metadata_xhtml.replace(
                "<dc:source>PG_URL</dc:source>",
                "<dc:source>{}</dc:source>".format(args.pg_url))

        file.seek(0)
        file.write(metadata_xhtml)
        file.truncate()

    # Set up local git repo
    repo = git.Repo.init(repo_name)

    if args.email:
        with repo.config_writer() as config:
            config.set_value("user", "email", args.email)

    # Set up remote git repos
    if args.create_se_repo:
        git_command = git.cmd.Git(repo_name)
        git_command.remote(
            "add", "origin",
            "standardebooks.org:/standardebooks.org/ebooks/{}.git".format(
                repo_name))

        # Set git to automatically push to SE
        git_command.config("branch.master.remote", "origin")
        git_command.config("branch.master.merge", "refs/heads/master")

        github_option = ""
        if args.create_github_repo:
            github_option = "--github"

        return_code = call([
            "ssh", "standardebooks.org",
            "/standardebooks.org/scripts/init-se-repo --repo-name={} --title-string=\"{}\" {}"
            .format(repo_name, title_string, github_option)
        ])
        if return_code != 0:
            raise se.RemoteCommandErrorException(
                "Failed to create repository on Standard Ebooks server: ssh returned code {}."
                .format(return_code))