Пример #1
0
def _create_draft(args: Namespace):
    """
    Implementation for `se create-draft`

    Creates a new ebook draft directory (named after the URL-safe author/title/
    contributor identifier, resolved against the current working directory),
    copies the template files into it, pre-fills them with the data found in
    `args`, optionally downloads and cleans up a Project Gutenberg transcription,
    and finally initializes a git repository in the new directory.

    Attributes read from `args`: `author`, `title`, `translator`, `illustrator`,
    `pg_url`, `offline` and `email`.

    Raises:
        se.InvalidInputException: the target directory already exists.
        se.RemoteCommandErrorException: `pg_url` was passed together with
            `offline`, a download failed, the ebook HTML URL could not be found
            in the metadata page, or id.loc.gov could not be reached.
        se.InvalidEncodingException: the downloaded HTML could not be fixed up.
        se.InvalidFileException: `body.xhtml` could not be written.
        se.InvalidXhtmlException: raised at the very end if the Project Gutenberg
            HTML could not be parsed; the rest of the draft is still created.
    """

    # Put together some variables for later use
    identifier = se.formatting.make_url_safe(args.author) + "/" + se.formatting.make_url_safe(args.title)
    title_string = args.title.replace("'", "’") + ", by " + args.author.replace("'", "’")
    sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", args.title)

    # Project Gutenberg data; only filled in when a PG URL was passed.
    # Initialized here so later code never touches an unbound name.
    pg_producers = []
    pg_subjects = []
    pg_publication_year = None
    pg_ebook_html = None
    pg_language = None

    if args.translator:
        identifier = identifier + "/" + se.formatting.make_url_safe(args.translator)
        title_string = title_string + ". Translated by " + args.translator

    if args.illustrator:
        identifier = identifier + "/" + se.formatting.make_url_safe(args.illustrator)
        title_string = title_string + ". Illustrated by " + args.illustrator

    repo_name = identifier.replace("/", "_")

    repo_path = Path(repo_name).resolve()

    if repo_path.is_dir():
        raise se.InvalidInputException(
            f"Directory already exists: [path][link=file://{repo_path}]{repo_path}[/][/]."
        )

    # Download PG HTML and do some fixups
    if args.pg_url:
        if args.offline:
            raise se.RemoteCommandErrorException(
                "Cannot download Project Gutenberg ebook when offline option is enabled."
            )

        args.pg_url = args.pg_url.replace("http://", "https://")

        # Get the ebook metadata
        try:
            response = requests.get(args.pg_url)
            pg_metadata_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook metadata page. Exception: {ex}"
            ) from ex

        soup = BeautifulSoup(pg_metadata_html, "lxml")

        # Get the ebook HTML URL from the metadata.
        # If several links match, the last one wins.
        pg_ebook_url = None
        for element in soup.select("a[type^=\"text/html\"]"):
            pg_ebook_url = regex.sub(r"^//", "https://", element["href"])
            pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/", pg_ebook_url)

        if not pg_ebook_url:
            raise se.RemoteCommandErrorException(
                "Could download ebook metadata, but couldn’t find URL for the ebook HTML."
            )

        # Get the ebook LCSH categories.
        # `find_all` collects every subject link in the cell and copes with cells
        # that have no link at all; `.get` tolerates a missing `datatype` attribute.
        for element in soup.select("td[property=\"dcterms:subject\"]"):
            if element.get("datatype") == "dcterms:LCSH":
                for subject_link in element.find_all("a"):
                    pg_subjects.append(subject_link.get_text().strip())

        # Get the PG publication date
        for element in soup.select("td[itemprop=\"datePublished\"]"):
            pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", element.text)

        # Get the actual ebook URL
        try:
            response = requests.get(pg_ebook_url)
            pg_ebook_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook HTML. Exception: {ex}"
            ) from ex

        try:
            fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
            pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
        except Exception as ex:
            raise se.InvalidEncodingException(
                f"Couldn’t determine text encoding of Project Gutenberg HTML file. Exception: {ex}"
            ) from ex

        # Try to guess the ebook language
        pg_language = "en-US"
        if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
            pg_language = "en-GB"

    # Create necessary directories
    (repo_path / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "css").mkdir(parents=True)
    (repo_path / "src" / "epub" / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "text").mkdir(parents=True)
    (repo_path / "src" / "META-INF").mkdir(parents=True)

    is_pg_html_parsed = True

    # Write PG data if we have it
    if args.pg_url and pg_ebook_html:
        try:
            soup = BeautifulSoup(pg_ebook_html, "html.parser")

            # Try to get the PG producers.  We only try this if there's a <pre> block with the header info (which is not always the case)
            for element in soup(text=regex.compile(r"\*\*\*\s*Produced by.+$", flags=regex.DOTALL)):
                if element.parent.name == "pre":
                    producers_text = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", element, flags=regex.DOTALL)
                    producers_text = regex.sub(r"\(.+?\)", "", producers_text, flags=regex.DOTALL)
                    producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net", "", producers_text, flags=regex.DOTALL)
                    producers_text = regex.sub(r"[\r\n]+", " ", producers_text, flags=regex.DOTALL)
                    producers_text = regex.sub(r",? and ", ", and ", producers_text)
                    producers_text = producers_text.replace(" and the Online", " and The Online")
                    producers_text = producers_text.replace(", and ", ", ").strip()

                    pg_producers = producers_text.split(", ")

            # Try to strip out the PG header
            for element in soup(text=regex.compile(r"\*\*\*\s*START OF THIS")):
                for sibling in element.parent.find_previous_siblings():
                    sibling.decompose()

                element.parent.decompose()

            # Try to strip out the PG license footer
            for element in soup(text=regex.compile(r"End of (the )?Project Gutenberg")):
                for sibling in element.parent.find_next_siblings():
                    sibling.decompose()

                element.parent.decompose()

            with open(repo_path / "src" / "epub" / "text" / "body.xhtml", "w", encoding="utf-8") as file:
                file.write(str(soup))
        except OSError as ex:
            raise se.InvalidFileException(
                f"Couldn’t write to ebook directory. Exception: {ex}") from ex
        except Exception:
            # Save this error for later, because it's still useful to complete the create-draft process
            # even if we've failed to parse PG's HTML source.
            # `Exception` (not a bare `except`) so Ctrl-C and SystemExit still propagate.
            is_pg_html_parsed = False
            se.quiet_remove(repo_path / "src" / "epub" / "text" / "body.xhtml")

    # Copy over templates
    _copy_template_file("gitignore", repo_path / ".gitignore")
    _copy_template_file("LICENSE.md", repo_path)
    _copy_template_file("container.xml", repo_path / "src" / "META-INF")
    _copy_template_file("mimetype", repo_path / "src")
    _copy_template_file("content.opf", repo_path / "src" / "epub")
    _copy_template_file("onix.xml", repo_path / "src" / "epub")
    _copy_template_file("toc.xhtml", repo_path / "src" / "epub")
    _copy_template_file("core.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("local.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("logo.svg", repo_path / "src" / "epub" / "images")
    _copy_template_file("colophon.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("imprint.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("uncopyright.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.svg", repo_path / "images")
    _copy_template_file("cover.jpg", repo_path / "images" / "cover.jpg")
    _copy_template_file("cover.svg", repo_path / "images" / "cover.svg")

    # Try to find Wikipedia links if possible.
    # Everything defaults to None so the offline case and the "no translator"
    # case need no special handling further down.
    author_wiki_url = None
    author_nacoaf_url = None
    ebook_wiki_url = None
    translator_wiki_url = None
    translator_nacoaf_url = None

    if not args.offline:
        author_wiki_url, author_nacoaf_url = _get_wikipedia_url(args.author, True)

        if args.title != "Short Fiction":
            # There's a "Short Fiction" Wikipedia article, so make an exception for that case
            ebook_wiki_url, _ = _get_wikipedia_url(args.title, False)

        if args.translator:
            translator_wiki_url, translator_nacoaf_url = _get_wikipedia_url(args.translator, True)

    # Pre-fill a few templates
    # NOTE(review): both SVGs are rewritten from scratch just below, so the two
    # SVG replacements look redundant; confirm before removing them.
    _replace_in_file(repo_path / "src" / "epub" / "text" / "titlepage.xhtml", "TITLE_STRING", title_string)
    _replace_in_file(repo_path / "images" / "titlepage.svg", "TITLE_STRING", title_string)
    _replace_in_file(repo_path / "images" / "cover.svg", "TITLE_STRING", title_string)

    # Create the titlepage SVG
    contributors = {}
    if args.translator:
        contributors["translated by"] = args.translator

    if args.illustrator:
        contributors["illustrated by"] = args.illustrator

    with open(repo_path / "images" / "titlepage.svg", "w", encoding="utf-8") as file:
        file.write(_generate_titlepage_svg(args.title, args.author, contributors, title_string))

    # Create the cover SVG
    with open(repo_path / "images" / "cover.svg", "w", encoding="utf-8") as file:
        file.write(_generate_cover_svg(args.title, args.author, title_string))

    # Build the cover/titlepage for distribution
    epub = SeEpub(repo_path)
    epub.generate_cover_svg()
    epub.generate_titlepage_svg()

    if args.pg_url:
        _replace_in_file(repo_path / "src" / "epub" / "text" / "imprint.xhtml", "PG_URL", args.pg_url)

    # Fill out the colophon
    with open(repo_path / "src" / "epub" / "text" / "colophon.xhtml", "r+", encoding="utf-8") as file:
        colophon_xhtml = file.read()

        colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
        colophon_xhtml = colophon_xhtml.replace(">AUTHOR<", f">{args.author}<")
        colophon_xhtml = colophon_xhtml.replace("TITLE", args.title)

        if author_wiki_url:
            colophon_xhtml = colophon_xhtml.replace("AUTHOR_WIKI_URL", author_wiki_url)

        if args.pg_url:
            colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

            if pg_publication_year:
                colophon_xhtml = colophon_xhtml.replace("PG_YEAR", pg_publication_year)

            if pg_producers:
                producers_xhtml = ""
                for i, producer in enumerate(pg_producers):
                    if "Distributed Proofread" in producer:
                        producers_xhtml += "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
                    elif "anonymous" in producer.lower():
                        producers_xhtml += "<b class=\"name\">An Anonymous Volunteer</b>"
                    else:
                        producers_xhtml += f"<b class=\"name\">{producer.strip('.')}</b>"

                    # Comma after every producer but the last...
                    if i < len(pg_producers) - 1:
                        producers_xhtml += ", "

                    # ...and an "and" right before the last one
                    if i == len(pg_producers) - 2:
                        producers_xhtml += "and "

                producers_xhtml += "<br/>"

                colophon_xhtml = colophon_xhtml.replace(
                    "<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>",
                    producers_xhtml)

        # Overwrite the file in place with the filled-out template
        file.seek(0)
        file.write(colophon_xhtml)
        file.truncate()

    # Fill out the metadata file
    with open(repo_path / "src" / "epub" / "content.opf", "r+", encoding="utf-8") as file:
        metadata_xml = file.read()

        metadata_xml = metadata_xml.replace("SE_IDENTIFIER", identifier)
        metadata_xml = metadata_xml.replace(">AUTHOR<", f">{args.author}<")
        metadata_xml = metadata_xml.replace(">TITLE_SORT<", f">{sorted_title}<")
        metadata_xml = metadata_xml.replace(">TITLE<", f">{args.title}<")
        metadata_xml = metadata_xml.replace("VCS_IDENTIFIER", str(repo_name))

        if pg_producers:
            producers_xhtml = ""
            for i, producer in enumerate(pg_producers, start=1):
                if "Distributed Proofread" in producer:
                    producers_xhtml += f"\t\t<dc:contributor id=\"transcriber-{i}\">The Online Distributed Proofreading Team</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{i}\">https://pgdp.net</meta>\n"
                elif "anonymous" in producer.lower():
                    producers_xhtml += f"\t\t<dc:contributor id=\"transcriber-{i}\">An Anonymous Volunteer</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Anonymous Volunteer, An</meta>\n"
                else:
                    producers_xhtml += f"\t\t<dc:contributor id=\"transcriber-{i}\">{producer.strip('.')}</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">TRANSCRIBER_SORT</meta>\n"

                producers_xhtml += f"\t\t<meta property=\"role\" refines=\"#transcriber-{i}\" scheme=\"marc:relators\">trc</meta>\n"

            metadata_xml = regex.sub(
                r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>",
                "\t\t" + producers_xhtml.strip(),
                metadata_xml,
                flags=regex.DOTALL)

        if author_wiki_url:
            metadata_xml = metadata_xml.replace(">AUTHOR_WIKI_URL<", f">{author_wiki_url}<")

        if author_nacoaf_url:
            metadata_xml = metadata_xml.replace(">AUTHOR_NACOAF_URL<", f">{author_nacoaf_url}<")

        if ebook_wiki_url:
            metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<", f">{ebook_wiki_url}<")

        if args.translator:
            metadata_xml = metadata_xml.replace(">TRANSLATOR<", f">{args.translator}<")

            if translator_wiki_url:
                metadata_xml = metadata_xml.replace(">TRANSLATOR_WIKI_URL<", f">{translator_wiki_url}<")

            if translator_nacoaf_url:
                metadata_xml = metadata_xml.replace(">TRANSLATOR_NACOAF_URL<", f">{translator_nacoaf_url}<")
        else:
            # No translator: drop everything from the translator block up to the artist block
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?<dc:contributor id=\"artist\">",
                "<dc:contributor id=\"artist\">",
                metadata_xml,
                flags=regex.DOTALL)

        if args.pg_url:
            if pg_subjects:
                subject_xhtml = ""

                # First all the <dc:subject> elements...
                for i, subject in enumerate(pg_subjects, start=1):
                    subject_xhtml += f"\t\t<dc:subject id=\"subject-{i}\">{subject}</dc:subject>\n"

                # ...then the <meta> elements refining each of them
                for i, subject in enumerate(pg_subjects, start=1):
                    subject_xhtml += f"\t\t<meta property=\"authority\" refines=\"#subject-{i}\">LCSH</meta>\n"

                    # Now, get the LCSH ID by querying LCSH directly.
                    try:
                        response = requests.get(
                            f"https://id.loc.gov/search/?q=%22{urllib.parse.quote(subject)}%22"
                        )
                        search_html = response.text
                    except Exception as ex:
                        raise se.RemoteCommandErrorException(
                            f"Couldn’t connect to [url][link=https://id.loc.gov]https://id.loc.gov[/][/]. Exception: {ex}"
                        ) from ex

                    result = regex.search(
                        fr"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{regex.escape(subject.replace(' -- ', '--'))}</a>",
                        search_html)

                    # Fall back to a placeholder when the search page has no exact match
                    loc_id = result.group(1) if result else "Unknown"

                    subject_xhtml += f"\t\t<meta property=\"term\" refines=\"#subject-{i}\">{loc_id}</meta>\n"

                metadata_xml = regex.sub(
                    r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>",
                    "\t\t" + subject_xhtml.strip(), metadata_xml)

            metadata_xml = metadata_xml.replace(
                "<dc:language>LANG</dc:language>",
                f"<dc:language>{pg_language}</dc:language>")
            metadata_xml = metadata_xml.replace(
                "<dc:source>PG_URL</dc:source>",
                f"<dc:source>{args.pg_url}</dc:source>")

        # Overwrite the file in place with the filled-out template
        file.seek(0)
        file.write(metadata_xml)
        file.truncate()

    # Set up local git repo
    repo = git.Repo.init(repo_path)

    if args.email:
        with repo.config_writer() as config:
            config.set_value("user", "email", args.email)

    # Report the deferred parse failure only now, after the draft has been fully created
    if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
        raise se.InvalidXhtmlException(
            "Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook."
        )
Пример #2
0
def _create_draft(args: Namespace):
    """
	Implementation for `se create-draft`
	"""

    # Put together some variables for later use
    authors = []
    translators = []
    illustrators = []
    pg_producers = []
    title = args.title.replace("'", "’")

    for author in args.author:
        authors.append({
            "name": author.replace("'", "’"),
            "wiki_url": None,
            "nacoaf_url": None
        })

    if args.translator:
        for translator in args.translator:
            translators.append({
                "name": translator.replace("'", "’"),
                "wiki_url": None,
                "nacoaf_url": None
            })

    if args.illustrator:
        for illustrator in args.illustrator:
            illustrators.append({
                "name": illustrator.replace("'", "’"),
                "wiki_url": None,
                "nacoaf_url": None
            })

    title_string = title
    if authors and authors[0]["name"].lower() != "anonymous":
        title_string += ", by " + _generate_contributor_string(authors, False)

    identifier = ""
    for author in authors:
        identifier += se.formatting.make_url_safe(author["name"]) + "_"

    identifier = identifier.rstrip("_") + "/" + se.formatting.make_url_safe(
        title)

    sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", title)

    if translators:
        title_string = title_string + ". Translated by " + _generate_contributor_string(
            translators, False)

        identifier = identifier + "/"

        for translator in translators:
            identifier += se.formatting.make_url_safe(translator["name"]) + "_"

        identifier = identifier.rstrip("_")

    if illustrators:
        title_string = title_string + ". Illustrated by " + _generate_contributor_string(
            illustrators, False)

        identifier = identifier + "/"

        for illustrator in illustrators:
            identifier += se.formatting.make_url_safe(
                illustrator["name"]) + "_"

        identifier = identifier.rstrip("_")

    repo_name = identifier.replace("/", "_")

    repo_path = Path(repo_name).resolve()

    if repo_path.is_dir():
        raise se.InvalidInputException(
            f"Directory already exists: [path][link=file://{repo_path}]{repo_path}[/][/]."
        )

    # Get data on authors
    for i, author in enumerate(authors):
        if not args.offline and author["name"].lower() != "anonymous":
            author["wiki_url"], author["nacoaf_url"] = _get_wikipedia_url(
                author["name"], True)

    # Get data on translators
    for i, translator in enumerate(translators):
        if not args.offline and translator["name"].lower() != "anonymous":
            translator["wiki_url"], translator[
                "nacoaf_url"] = _get_wikipedia_url(translator["name"], True)

    # Get data on illustrators
    for i, illustrator in enumerate(illustrators):
        if not args.offline and illustrator["name"].lower() != "anonymous":
            illustrator["wiki_url"], illustrator[
                "nacoaf_url"] = _get_wikipedia_url(illustrator["name"], True)

    # Download PG HTML and do some fixups
    if args.pg_url:
        if args.offline:
            raise se.RemoteCommandErrorException(
                "Cannot download Project Gutenberg ebook when offline option is enabled."
            )

        args.pg_url = args.pg_url.replace("http://", "https://")

        # Get the ebook metadata
        try:
            response = requests.get(args.pg_url)
            pg_metadata_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook metadata page. Exception: {ex}"
            )

        parser = etree.HTMLParser()
        dom = etree.parse(StringIO(pg_metadata_html), parser)

        # Get the ebook HTML URL from the metadata
        pg_ebook_url = None
        for node in dom.xpath("/html/body//a[contains(@type, 'text/html')]"):
            pg_ebook_url = regex.sub(r"^//", "https://", node.get("href"))
            pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/",
                                     pg_ebook_url)

        if not pg_ebook_url:
            raise se.RemoteCommandErrorException(
                "Could download ebook metadata, but couldn’t find URL for the ebook HTML."
            )

        # Get the ebook LCSH categories
        pg_subjects = []
        for node in dom.xpath(
                "/html/body//td[contains(@property, 'dcterms:subject')]"):
            if node.get("datatype") == "dcterms:LCSH":
                for subject_link in node.xpath("./a"):
                    pg_subjects.append(subject_link.text.strip())

        # Get the PG publication date
        pg_publication_year = None
        for node in dom.xpath("//td[@itemprop='datePublished']"):
            pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", node.text)

        # Get the actual ebook URL
        try:
            response = requests.get(pg_ebook_url)
            pg_ebook_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook HTML. Exception: {ex}"
            )

        try:
            fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
            pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
        except Exception as ex:
            raise se.InvalidEncodingException(
                f"Couldn’t determine text encoding of Project Gutenberg HTML file. Exception: {ex}"
            )

        # Try to guess the ebook language
        pg_language = "en-US"
        if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
            pg_language = "en-GB"

    # Create necessary directories
    (repo_path / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "css").mkdir(parents=True)
    (repo_path / "src" / "epub" / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "text").mkdir(parents=True)
    (repo_path / "src" / "META-INF").mkdir(parents=True)

    is_pg_html_parsed = True

    # Write PG data if we have it
    if args.pg_url and pg_ebook_html:
        try:
            dom = etree.parse(
                StringIO(regex.sub(r"encoding=\".+?\"", "", pg_ebook_html)),
                parser)
            namespaces = {"re": "http://exslt.org/regular-expressions"}

            for node in dom.xpath(
                    "//*[re:test(text(), '\\*\\*\\*\\s*Produced by.+')]",
                    namespaces=namespaces):
                producers_text = regex.sub(
                    r"^<[^>]+?>", "",
                    etree.tostring(node, encoding=str, with_tail=False))
                producers_text = regex.sub(r"<[^>]+?>$", "", producers_text)

                producers_text = regex.sub(r".+?Produced by (.+?)\s*$",
                                           "\\1",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r"\(.+?\)",
                                           "",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net",
                                           "",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r"[\r\n]+",
                                           " ",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r",? and ", ", and ",
                                           producers_text)
                producers_text = producers_text.replace(
                    " and the Online", " and The Online")
                producers_text = producers_text.replace(", and ", ", ").strip()

                pg_producers = [
                    producer.strip()
                    for producer in regex.split(',|;', producers_text)
                ]

            # Try to strip out the PG header
            for node in dom.xpath(
                    "//*[re:test(text(), '\\*\\*\\*\\s*START OF THIS')]",
                    namespaces=namespaces):
                for sibling_node in node.xpath("./preceding-sibling::*"):
                    easy_node = se.easy_xml.EasyXmlElement(sibling_node)
                    easy_node.remove()

                easy_node = se.easy_xml.EasyXmlElement(node)
                easy_node.remove()

            # Try to strip out the PG license footer
            for node in dom.xpath(
                    "//*[re:test(text(), 'End of (the )?Project Gutenberg')]",
                    namespaces=namespaces):
                for sibling_node in node.xpath("./following-sibling::*"):
                    easy_node = se.easy_xml.EasyXmlElement(sibling_node)
                    easy_node.remove()

                easy_node = se.easy_xml.EasyXmlElement(node)
                easy_node.remove()

            # lxml will put the xml declaration in a weird place, remove it first
            output = regex.sub(r"<\?xml.+?\?>", "",
                               etree.tostring(dom, encoding="unicode"))

            # Now re-add it
            output = """<?xml version="1.0" encoding="utf-8"?>\n""" + output

            # lxml can also output duplicate default namespace declarations so remove the first one only
            output = regex.sub(r"(xmlns=\".+?\")(\sxmlns=\".+?\")+", r"\1",
                               output)

            with open(repo_path / "src" / "epub" / "text" / "body.xhtml",
                      "w",
                      encoding="utf-8") as file:
                file.write(output)

        except OSError as ex:
            raise se.InvalidFileException(
                f"Couldn’t write to ebook directory. Exception: {ex}")
        except Exception as ex:
            # Save this error for later, because it's still useful to complete the create-draft process
            # even if we've failed to parse PG's HTML source.
            is_pg_html_parsed = False
            se.quiet_remove(repo_path / "src" / "epub" / "text" / "body.xhtml")

    # Copy over templates
    _copy_template_file("gitignore", repo_path / ".gitignore")
    _copy_template_file("LICENSE.md", repo_path)
    _copy_template_file("container.xml", repo_path / "src" / "META-INF")
    _copy_template_file("mimetype", repo_path / "src")
    _copy_template_file("content.opf", repo_path / "src" / "epub")
    _copy_template_file("onix.xml", repo_path / "src" / "epub")
    _copy_template_file("toc.xhtml", repo_path / "src" / "epub")
    _copy_template_file("core.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("local.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("se.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("logo.svg", repo_path / "src" / "epub" / "images")
    _copy_template_file("colophon.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("imprint.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("uncopyright.xhtml",
                        repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.svg", repo_path / "images")
    _copy_template_file("cover.jpg", repo_path / "images" / "cover.jpg")
    _copy_template_file("cover.svg", repo_path / "images" / "cover.svg")

    # Try to find Wikipedia links if possible
    ebook_wiki_url = None

    if not args.offline and title != "Short Fiction":
        # There's a "Short Fiction" Wikipedia article, so make an exception for that case
        ebook_wiki_url, _ = _get_wikipedia_url(title, False)

    # Pre-fill a few templates
    _replace_in_file(repo_path / "src" / "epub" / "text" / "titlepage.xhtml",
                     "TITLE_STRING", title_string)
    _replace_in_file(repo_path / "images" / "titlepage.svg", "TITLE_STRING",
                     title_string)
    _replace_in_file(repo_path / "images" / "cover.svg", "TITLE_STRING",
                     title_string)

    # Create the titlepage SVG
    contributors = {}
    if args.translator:
        contributors["translated by"] = _generate_contributor_string(
            translators, False)

    if args.illustrator:
        contributors["illustrated by"] = _generate_contributor_string(
            illustrators, False)

    with open(repo_path / "images" / "titlepage.svg", "w",
              encoding="utf-8") as file:
        file.write(
            _generate_titlepage_svg(title,
                                    [author["name"] for author in authors],
                                    contributors, title_string))

    # Create the cover SVG
    with open(repo_path / "images" / "cover.svg", "w",
              encoding="utf-8") as file:
        file.write(
            _generate_cover_svg(title, [author["name"] for author in authors],
                                title_string))

    # Build the cover/titlepage for distribution
    epub = SeEpub(repo_path)
    epub.generate_cover_svg()
    epub.generate_titlepage_svg()

    if args.pg_url:
        _replace_in_file(repo_path / "src" / "epub" / "text" / "imprint.xhtml",
                         "PG_URL", args.pg_url)

    # Fill out the colophon
    with open(repo_path / "src" / "epub" / "text" / "colophon.xhtml",
              "r+",
              encoding="utf-8") as file:
        colophon_xhtml = file.read()

        colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
        colophon_xhtml = colophon_xhtml.replace("TITLE", title)

        contributor_string = _generate_contributor_string(authors, True)

        if contributor_string == "":
            colophon_xhtml = colophon_xhtml.replace(
                " by<br/>\n\t\t\t<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>",
                contributor_string)
        else:
            colophon_xhtml = colophon_xhtml.replace(
                "<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>", contributor_string)

        if translators:
            translator_block = f"It was translated from ORIGINAL_LANGUAGE in TRANSLATION_YEAR by<br/>\n\t\t\t{_generate_contributor_string(translators, True)}.</p>"
            colophon_xhtml = colophon_xhtml.replace(
                "</p>\n\t\t\t<p>This ebook was produced for the<br/>",
                f"<br/>\n\t\t\t{translator_block}\n\t\t\t<p>This ebook was produced for the<br/>"
            )

        if args.pg_url:
            colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

            if pg_publication_year:
                colophon_xhtml = colophon_xhtml.replace(
                    "PG_YEAR", pg_publication_year)

            if pg_producers:
                producers_xhtml = ""
                for i, producer in enumerate(pg_producers):
                    if "Distributed Proofread" in producer:
                        producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
                    elif "anonymous" in producer.lower():
                        producers_xhtml = producers_xhtml + "<b class=\"name\">An Anonymous Volunteer</b>"
                    else:
                        producers_xhtml = producers_xhtml + f"<b class=\"name\">{_add_name_abbr(producer).strip('.')}</b>"

                    if i < len(pg_producers) - 1:
                        producers_xhtml = producers_xhtml + ", "

                    if i == len(pg_producers) - 2:
                        producers_xhtml = producers_xhtml + "and "

                producers_xhtml = producers_xhtml + "<br/>"

                colophon_xhtml = colophon_xhtml.replace(
                    "<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>",
                    producers_xhtml)

        file.seek(0)
        file.write(colophon_xhtml)
        file.truncate()

    # Fill out the metadata file
    with open(repo_path / "src" / "epub" / "content.opf",
              "r+",
              encoding="utf-8") as file:
        metadata_xml = file.read()

        metadata_xml = metadata_xml.replace("SE_IDENTIFIER", identifier)
        metadata_xml = metadata_xml.replace(">TITLE_SORT<",
                                            f">{sorted_title}<")
        metadata_xml = metadata_xml.replace(">TITLE<", f">{title}<")
        metadata_xml = metadata_xml.replace("VCS_IDENTIFIER", str(repo_name))

        if pg_producers:
            producers_xhtml = ""
            i = 1
            for producer in pg_producers:
                if "Distributed Proofread" in producer:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">The Online Distributed Proofreading Team</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{i}\">https://pgdp.net</meta>\n"
                elif "anonymous" in producer.lower():
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">An Anonymous Volunteer</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Anonymous Volunteer, An</meta>\n"
                else:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">{producer.strip('.')}</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">TRANSCRIBER_SORT</meta>\n"

                producers_xhtml = producers_xhtml + f"\t\t<meta property=\"role\" refines=\"#transcriber-{i}\" scheme=\"marc:relators\">trc</meta>\n"

                i = i + 1

            metadata_xml = regex.sub(
                r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>",
                "\t\t" + producers_xhtml.strip(),
                metadata_xml,
                flags=regex.DOTALL)

        if ebook_wiki_url:
            metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<",
                                                f">{ebook_wiki_url}<")

        authors_xml = _generate_metadata_contributor_xml(authors, "author")
        authors_xml = authors_xml.replace("dc:contributor", "dc:creator")
        metadata_xml = regex.sub(
            r"<dc:creator id=\"author\">AUTHOR</dc:creator>.+?scheme=\"marc:relators\">aut</meta>",
            authors_xml,
            metadata_xml,
            flags=regex.DOTALL)

        if translators:
            translators_xml = _generate_metadata_contributor_xml(
                translators, "translator")
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>",
                translators_xml,
                metadata_xml,
                flags=regex.DOTALL)
        else:
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>\n\t\t",
                "",
                metadata_xml,
                flags=regex.DOTALL)

        if illustrators:
            illustrators_xml = _generate_metadata_contributor_xml(
                illustrators, "illustrator")
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>",
                illustrators_xml,
                metadata_xml,
                flags=regex.DOTALL)
        else:
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>\n\t\t",
                "",
                metadata_xml,
                flags=regex.DOTALL)

        if args.pg_url:
            if pg_subjects:
                subject_xhtml = ""

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<dc:subject id=\"subject-{i}\">{subject}</dc:subject>\n"
                    i = i + 1

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<meta property=\"authority\" refines=\"#subject-{i}\">LCSH</meta>\n"

                    # Now, get the LCSH ID by querying LCSH directly.
                    try:
                        response = requests.get(
                            f"https://id.loc.gov/search/?q=%22{urllib.parse.quote(subject)}%22"
                        )
                        result = regex.search(
                            fr"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{regex.escape(subject.replace(' -- ', '--'))}</a>",
                            response.text)

                        loc_id = "Unknown"
                        try:
                            loc_id = result.group(1)
                        except Exception as ex:
                            pass

                        subject_xhtml = subject_xhtml + f"\t\t<meta property=\"term\" refines=\"#subject-{i}\">{loc_id}</meta>\n"

                    except Exception as ex:
                        raise se.RemoteCommandErrorException(
                            f"Couldn’t connect to [url][link=https://id.loc.gov]https://id.loc.gov[/][/]. Exception: {ex}"
                        )

                    i = i + 1

                metadata_xml = regex.sub(
                    r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>",
                    "\t\t" + subject_xhtml.strip(), metadata_xml)

            metadata_xml = metadata_xml.replace(
                "<dc:language>LANG</dc:language>",
                f"<dc:language>{pg_language}</dc:language>")
            metadata_xml = metadata_xml.replace(
                "<dc:source>PG_URL</dc:source>",
                f"<dc:source>{args.pg_url}</dc:source>")

        file.seek(0)
        file.write(metadata_xml)
        file.truncate()

    # Set up local git repo
    repo = git.Repo.init(repo_path)

    if args.email:
        with repo.config_writer() as config:
            config.set_value("user", "email", args.email)

    if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
        raise se.InvalidXhtmlException(
            "Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook."
        )
Пример #3
0
def create_draft(args: list):
    """
    Entry point for `se create-draft`.

    Creates a new ebook draft directory (relative to the current working
    directory), optionally seeds it with a Project Gutenberg transcription,
    copies the packaged templates into it, pre-fills the titlepage, cover,
    imprint, colophon and `content.opf` placeholders, initialises a local git
    repository, and optionally registers the repository on the Standard
    Ebooks server.

    Parameters:
        args: Parsed command-line options. Despite the `list` annotation, the
            value is accessed by attribute; the attributes read here are
            `author`, `title`, `translator`, `illustrator`, `pg_url`, `email`,
            `create_se_repo` and `create_github_repo`. `args.pg_url` is
            rewritten in place to use `https://`.

    Raises:
        se.InvalidInputException: The target directory already exists, or the
            Project Gutenberg HTML could not be processed.
        se.RemoteCommandErrorException: A download from Project Gutenberg or
            id.loc.gov failed, the metadata page had no HTML link, or the
            remote repository creation command returned non-zero.
        se.InvalidEncodingException: The Project Gutenberg HTML could not be
            normalised.
        se.InvalidFileException: The ebook body could not be written to disk.
    """

    # Imported locally: only needed to quote the remote shell command below.
    import shlex

    # Put together some variables for later use
    identifier = se.formatting.make_url_safe(
        args.author) + "/" + se.formatting.make_url_safe(args.title)
    title_string = args.title.replace(
        "'", "’") + ", by " + args.author.replace("'", "’")
    # Move a leading article to the end, e.g. "The Title" -> "Title, The"
    sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", args.title)
    pg_producers = []

    if args.translator:
        identifier = identifier + "/" + se.formatting.make_url_safe(
            args.translator)
        title_string = title_string + ". Translated by " + args.translator

    if args.illustrator:
        identifier = identifier + "/" + se.formatting.make_url_safe(
            args.illustrator)
        title_string = title_string + ". Illustrated by " + args.illustrator

    # The identifier's path separators become underscores in the directory name
    repo_name = Path(identifier.replace("/", "_"))

    if repo_name.is_dir():
        raise se.InvalidInputException(
            "./{}/ already exists.".format(repo_name))

    # Download PG HTML and do some fixups
    if args.pg_url:
        args.pg_url = args.pg_url.replace("http://", "https://")

        # Get the ebook metadata
        try:
            response = requests.get(args.pg_url)
            pg_metadata_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                "Couldn’t download Project Gutenberg ebook metadata page. Error: {}"
                .format(ex)) from ex

        soup = BeautifulSoup(pg_metadata_html, "lxml")

        # Get the ebook HTML URL from the metadata.
        # If several links match, the last one wins.
        pg_ebook_url = None
        for element in soup.select("a[type^=\"text/html\"]"):
            pg_ebook_url = regex.sub(r"^//", "https://", element["href"])
            pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/",
                                     pg_ebook_url)

        if not pg_ebook_url:
            raise se.RemoteCommandErrorException(
                "Could download ebook metadata, but couldn’t find URL for the ebook HTML."
            )

        # Get the ebook LCSH categories
        pg_subjects = []
        for element in soup.select("td[property=\"dcterms:subject\"]"):
            # .get() so a cell without a datatype attribute is skipped instead of raising KeyError
            if element.get("datatype") != "dcterms:LCSH":
                continue

            subject_anchor = element.find("a")
            if subject_anchor is None:
                # No link in this cell; nothing to record
                continue

            for subject_link in subject_anchor:
                pg_subjects.append(subject_link.strip())

        # Get the PG publication date
        pg_publication_year = None
        for element in soup.select("td[itemprop=\"datePublished\"]"):
            pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1",
                                            element.text)

        # Get the actual ebook URL
        try:
            response = requests.get(pg_ebook_url)
            pg_ebook_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                "Couldn’t download Project Gutenberg ebook HTML. Error: {}".
                format(ex)) from ex

        try:
            fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
            pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
        except Exception as ex:
            raise se.InvalidEncodingException(
                "Couldn’t determine text encoding of Project Gutenberg HTML file. Error: {}"
                .format(ex)) from ex

        # Try to guess the ebook language
        pg_language = "en-US"
        if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
            pg_language = "en-GB"

    # Create necessary directories
    (repo_name / "images").mkdir(parents=True)
    (repo_name / "src" / "epub" / "css").mkdir(parents=True)
    (repo_name / "src" / "epub" / "images").mkdir(parents=True)
    (repo_name / "src" / "epub" / "text").mkdir(parents=True)
    (repo_name / "src" / "META-INF").mkdir(parents=True)

    # Write PG data if we have it
    if args.pg_url and pg_ebook_html:
        try:
            soup = BeautifulSoup(pg_ebook_html, "html.parser")

            # Try to get the PG producers.  We only try this if there's a <pre> block with the header info (which is not always the case)
            for element in soup(text=regex.compile(r"\*\*\*\s*Produced by.+$",
                                                   flags=regex.DOTALL)):
                if element.parent.name == "pre":
                    # Reduce the header text to a clean, comma-separated list of names
                    pg_producers = regex.sub(r".+?Produced by (.+?)\s*$",
                                             "\\1",
                                             element,
                                             flags=regex.DOTALL)
                    pg_producers = regex.sub(r"\(.+?\)",
                                             "",
                                             pg_producers,
                                             flags=regex.DOTALL)
                    pg_producers = regex.sub(r"(at )?https?://www\.pgdp\.net",
                                             "",
                                             pg_producers,
                                             flags=regex.DOTALL)
                    pg_producers = regex.sub(r"[\r\n]+",
                                             " ",
                                             pg_producers,
                                             flags=regex.DOTALL)
                    pg_producers = regex.sub(r",? and ", ", and ",
                                             pg_producers)
                    pg_producers = pg_producers.replace(
                        " and the Online", " and The Online")
                    pg_producers = pg_producers.replace(
                        ", and ", ", ").strip().split(", ")

            # Try to strip out the PG header
            for element in soup(text=regex.compile(r"\*\*\*\s*START OF THIS")):
                for sibling in element.parent.find_previous_siblings():
                    sibling.decompose()

                element.parent.decompose()

            # Try to strip out the PG license footer
            for element in soup(
                    text=regex.compile(r"End of (the )?Project Gutenberg")):
                for sibling in element.parent.find_next_siblings():
                    sibling.decompose()

                element.parent.decompose()

            with open(repo_name / "src" / "epub" / "text" / "body.xhtml",
                      "w",
                      encoding="utf-8") as file:
                file.write(str(soup))
        except IOError as ex:
            raise se.InvalidFileException(
                "Couldn’t write to ebook directory. Error: {}".format(
                    ex)) from ex
        except Exception as ex:
            # Deliberately not a bare `except:` so Ctrl-C and interpreter
            # exit are not misreported as a parse failure.
            raise se.InvalidInputException(
                "Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook."
            ) from ex

    # Copy over templates.
    # Each entry maps a path under the package's data/templates directory to
    # its destination (a directory, or a full file path when renaming).
    templates_dir = Path("data") / "templates"
    epub_dir = repo_name / "src" / "epub"
    template_destinations = [
        (Path("gitignore"), repo_name / ".gitignore"),
        (Path("LICENSE.md"), repo_name),
        (Path("META-INF") / "container.xml", repo_name / "src" / "META-INF"),
        (Path("mimetype"), repo_name / "src"),
        (Path("content.opf"), epub_dir),
        (Path("onix.xml"), epub_dir),
        (Path("toc.xhtml"), epub_dir),
        (Path("core.css"), epub_dir / "css"),
        (Path("local.css"), epub_dir / "css"),
        (Path("logo.svg"), epub_dir / "images"),
        (Path("colophon.xhtml"), epub_dir / "text"),
        (Path("imprint.xhtml"), epub_dir / "text"),
        (Path("titlepage.xhtml"), epub_dir / "text"),
        (Path("uncopyright.xhtml"), epub_dir / "text"),
        (Path("titlepage.svg"), repo_name / "images"),
        (Path("cover.jpg"), repo_name / "images" / "cover.jpg"),
        (Path("cover.svg"), repo_name / "images" / "cover.svg"),
    ]
    for template, destination in template_destinations:
        shutil.copy(resource_filename("se", str(templates_dir / template)),
                    destination)

    # Try to find Wikipedia links if possible
    author_wiki_url, author_nacoaf_url = _get_wikipedia_url(args.author, True)
    ebook_wiki_url, _ = _get_wikipedia_url(args.title, False)
    translator_wiki_url = None
    translator_nacoaf_url = None
    if args.translator:
        translator_wiki_url, translator_nacoaf_url = _get_wikipedia_url(
            args.translator, True)

    # Pre-fill a few templates
    se.replace_in_file(repo_name / "src" / "epub" / "text" / "titlepage.xhtml",
                       "TITLE_STRING", title_string)
    se.replace_in_file(repo_name / "images" / "titlepage.svg", "TITLE_STRING",
                       title_string)
    se.replace_in_file(repo_name / "images" / "cover.svg", "TITLE_STRING",
                       title_string)

    # Create the titlepage SVG
    contributors = {}
    if args.translator:
        contributors["translated by"] = args.translator

    if args.illustrator:
        contributors["illustrated by"] = args.illustrator

    with open(repo_name / "images" / "titlepage.svg", "w",
              encoding="utf-8") as file:
        file.write(
            _generate_titlepage_svg(args.title, args.author, contributors,
                                    title_string))

    # Create the cover SVG
    with open(repo_name / "images" / "cover.svg", "w",
              encoding="utf-8") as file:
        file.write(_generate_cover_svg(args.title, args.author, title_string))

    if args.pg_url:
        se.replace_in_file(
            repo_name / "src" / "epub" / "text" / "imprint.xhtml", "PG_URL",
            args.pg_url)

    # Fill out the colophon: read, substitute placeholders, rewrite in place
    with open(repo_name / "src" / "epub" / "text" / "colophon.xhtml",
              "r+",
              encoding="utf-8") as file:
        colophon_xhtml = file.read()

        colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
        colophon_xhtml = colophon_xhtml.replace(">AUTHOR<",
                                                ">{}<".format(args.author))
        colophon_xhtml = colophon_xhtml.replace("TITLE", args.title)

        if author_wiki_url:
            colophon_xhtml = colophon_xhtml.replace("AUTHOR_WIKI_URL",
                                                    author_wiki_url)

        if args.pg_url:
            colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

            if pg_publication_year:
                colophon_xhtml = colophon_xhtml.replace(
                    "PG_YEAR", pg_publication_year)

            if pg_producers:
                producers_xhtml = ""
                for i, producer in enumerate(pg_producers):
                    if "Distributed Proofreading" in producer:
                        producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
                    else:
                        producers_xhtml = producers_xhtml + "<b class=\"name\">{}</b>".format(
                            producer)

                    # Comma after every producer except the last...
                    if i < len(pg_producers) - 1:
                        producers_xhtml = producers_xhtml + ", "

                    # ...and "and " before the last one
                    if i == len(pg_producers) - 2:
                        producers_xhtml = producers_xhtml + "and "

                producers_xhtml = producers_xhtml + "<br/>"

                colophon_xhtml = colophon_xhtml.replace(
                    "<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>",
                    producers_xhtml)

        # Rewrite from the start; truncate in case the new text is shorter
        file.seek(0)
        file.write(colophon_xhtml)
        file.truncate()

    # Fill out the metadata file
    with open(repo_name / "src" / "epub" / "content.opf",
              "r+",
              encoding="utf-8") as file:
        metadata_xhtml = file.read()

        metadata_xhtml = metadata_xhtml.replace("SE_IDENTIFIER", identifier)
        metadata_xhtml = metadata_xhtml.replace(">AUTHOR<",
                                                ">{}<".format(args.author))
        metadata_xhtml = metadata_xhtml.replace(">TITLE_SORT<",
                                                ">{}<".format(sorted_title))
        metadata_xhtml = metadata_xhtml.replace(">TITLE<",
                                                ">{}<".format(args.title))
        metadata_xhtml = metadata_xhtml.replace("VCS_IDENTIFIER",
                                                str(repo_name))

        if pg_producers:
            producers_xhtml = ""
            for i, producer in enumerate(pg_producers, start=1):
                producers_xhtml = producers_xhtml + "\t\t<dc:contributor id=\"transcriber-{}\">{}</dc:contributor>\n".format(
                    i, producer)

                if "Distributed Proofreading" in producer:
                    producers_xhtml = producers_xhtml + "\t\t<meta property=\"file-as\" refines=\"#transcriber-{0}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{0}\">https://pgdp.net</meta>\n".format(
                        i)
                else:
                    # The sort name is left as a placeholder
                    producers_xhtml = producers_xhtml + "\t\t<meta property=\"file-as\" refines=\"#transcriber-{}\">TRANSCRIBER_SORT</meta>\n".format(
                        i)

                producers_xhtml = producers_xhtml + "\t\t<meta property=\"role\" refines=\"#transcriber-{}\" scheme=\"marc:relators\">trc</meta>\n".format(
                    i)

            metadata_xhtml = regex.sub(
                r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>",
                "\t\t" + producers_xhtml.strip(),
                metadata_xhtml,
                flags=regex.DOTALL)

        if author_wiki_url:
            metadata_xhtml = metadata_xhtml.replace(
                ">AUTHOR_WIKI_URL<", ">{}<".format(author_wiki_url))

        if author_nacoaf_url:
            metadata_xhtml = metadata_xhtml.replace(
                ">AUTHOR_NACOAF_URL<", ">{}<".format(author_nacoaf_url))

        if ebook_wiki_url:
            metadata_xhtml = metadata_xhtml.replace(
                ">EBOOK_WIKI_URL<", ">{}<".format(ebook_wiki_url))

        if args.translator:
            metadata_xhtml = metadata_xhtml.replace(
                ">TRANSLATOR<", ">{}<".format(args.translator))

            if translator_wiki_url:
                metadata_xhtml = metadata_xhtml.replace(
                    ">TRANSLATOR_WIKI_URL<",
                    ">{}<".format(translator_wiki_url))

            if translator_nacoaf_url:
                metadata_xhtml = metadata_xhtml.replace(
                    ">TRANSLATOR_NACOAF_URL<",
                    ">{}<".format(translator_nacoaf_url))
        else:
            # No translator: drop everything from the translator element up
            # to the following artist element
            metadata_xhtml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?<dc:contributor id=\"artist\">",
                "<dc:contributor id=\"artist\">",
                metadata_xhtml,
                flags=regex.DOTALL)

        if args.pg_url:
            if pg_subjects:
                subject_xhtml = ""

                # First emit all the <dc:subject> elements...
                for i, subject in enumerate(pg_subjects, start=1):
                    subject_xhtml = subject_xhtml + "\t\t<dc:subject id=\"subject-{}\">{}</dc:subject>\n".format(
                        i, subject)

                # ...then the authority/term <meta> pairs refining them
                for i, subject in enumerate(pg_subjects, start=1):
                    subject_xhtml = subject_xhtml + "\t\t<meta property=\"authority\" refines=\"#subject-{}\">LCSH</meta>\n".format(
                        i)

                    # Now, get the LCSH ID by querying LCSH directly.
                    try:
                        response = requests.get(
                            "https://id.loc.gov/search/?q=%22{}%22".format(
                                urllib.parse.quote(subject)))
                    except Exception as ex:
                        raise se.RemoteCommandErrorException(
                            "Couldn’t connect to id.loc.gov. Error: {}".format(
                                ex)) from ex

                    result = regex.search(
                        r"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{}</a>"
                        .format(regex.escape(subject.replace(" -- ",
                                                             "--"))),
                        response.text)

                    # Fall back to "Unknown" when the search page has no exact match
                    loc_id = result.group(1) if result is not None else "Unknown"

                    subject_xhtml = subject_xhtml + "\t\t<meta property=\"term\" refines=\"#subject-{}\">{}</meta>\n".format(
                        i, loc_id)

                metadata_xhtml = regex.sub(
                    r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>",
                    "\t\t" + subject_xhtml.strip(), metadata_xhtml)

            metadata_xhtml = metadata_xhtml.replace(
                "<dc:language>LANG</dc:language>",
                "<dc:language>{}</dc:language>".format(pg_language))
            metadata_xhtml = metadata_xhtml.replace(
                "<dc:source>PG_URL</dc:source>",
                "<dc:source>{}</dc:source>".format(args.pg_url))

        # Rewrite from the start; truncate in case the new text is shorter
        file.seek(0)
        file.write(metadata_xhtml)
        file.truncate()

    # Set up local git repo
    repo = git.Repo.init(repo_name)

    if args.email:
        with repo.config_writer() as config:
            config.set_value("user", "email", args.email)

    # Set up remote git repos
    if args.create_se_repo:
        git_command = git.cmd.Git(repo_name)
        git_command.remote(
            "add", "origin",
            "standardebooks.org:/standardebooks.org/ebooks/{}.git".format(
                repo_name))

        # Set git to automatically push to SE
        git_command.config("branch.master.remote", "origin")
        git_command.config("branch.master.merge", "refs/heads/master")

        github_option = ""
        if args.create_github_repo:
            github_option = "--github"

        # ssh hands this string to the remote shell, so quote the
        # user-supplied parts; a title containing `"`, `$` or a backtick
        # would otherwise break or alter the command.
        remote_command = "/standardebooks.org/scripts/init-se-repo --repo-name={} --title-string={} {}".format(
            shlex.quote(str(repo_name)), shlex.quote(title_string),
            github_option)

        return_code = call(["ssh", "standardebooks.org", remote_command])
        if return_code != 0:
            raise se.RemoteCommandErrorException(
                "Failed to create repository on Standard Ebooks server: ssh returned code {}."
                .format(return_code))