def _endnotes_dom(self) -> se.easy_xml.EasyXhtmlTree:
	"""
	Accessor

	Return an EasyXhtmlTree object representing the endnotes.xhtml file for this ebook.

	INPUTS
	None

	OUTPUTS
	An EasyXhtmlTree object representing the endnotes.xhtml file for this ebook.
	"""

	if not self.__endnotes_dom:
		try:
			with open(self.path / "src" / "epub" / "text" / "endnotes.xhtml") as file:
				self.__endnotes_dom = se.easy_xml.EasyXhtmlTree(file.read())
		except Exception as ex:
			raise se.InvalidFileException(f"Couldn’t open file: [path][link=file://{self.path / 'src' / 'epub' / 'text' / 'endnotes.xhtml'}]{self.path / 'src' / 'epub' / 'text' / 'endnotes.xhtml'}[/][/].") from ex

	return self.__endnotes_dom
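# A standalone sketch of the memoized-accessor pattern used above, with illustrative
# names (it is not part of the SE codebase): the parse result is cached on first
# access, so repeated property reads don't re-read or re-parse the file.
from pathlib import Path

class CachedDocument:
	def __init__(self, path: Path) -> None:
		self.path = path
		self.__text = None  # Cache; populated on first access

	@property
	def text(self) -> str:
		if self.__text is None:
			self.__text = self.path.read_text(encoding="utf-8")
		return self.__text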
def _endnotes_soup(self) -> BeautifulSoup:
	"""
	Accessor

	Return a BeautifulSoup object representing the endnotes.xhtml file for this ebook.

	INPUTS
	None

	OUTPUTS
	A BeautifulSoup object representing the endnotes.xhtml file for this ebook.
	"""

	if not self.__endnotes_soup:
		try:
			with open(self.path / "src" / "epub" / "text" / "endnotes.xhtml") as file:
				self.__endnotes_soup = BeautifulSoup(file.read(), "html.parser")
		except Exception as ex:
			raise se.InvalidFileException("Couldn’t open file: {}".format(str(self.path / "src" / "epub" / "text" / "endnotes.xhtml"))) from ex

	return self.__endnotes_soup
def _endnotes_soup(self) -> BeautifulSoup:
	"""
	Accessor

	Return a BeautifulSoup object representing the endnotes.xhtml file for this ebook.

	INPUTS
	None

	OUTPUTS
	A BeautifulSoup object representing the endnotes.xhtml file for this ebook.
	"""

	if not self.__endnotes_soup:
		try:
			with open(self.path / "src" / "epub" / "text" / "endnotes.xhtml") as file:
				self.__endnotes_soup = BeautifulSoup(file.read(), "html.parser")
		except Exception as ex:
			raise se.InvalidFileException(f"Couldn’t open file: [path][link=file://{self.path / 'src' / 'epub' / 'text' / 'endnotes.xhtml'}]{self.path / 'src' / 'epub' / 'text' / 'endnotes.xhtml'}[/][/].") from ex

	return self.__endnotes_soup
def process_all_content(file_list: list, text_path: str) -> Tuple[list, list]:
	"""
	Analyze the whole content of the project, and build and return lists of toc_items and landmarks.

	INPUTS:
	file_list: a list of all content files
	text_path: the path to the contents folder (src/epub/text)

	OUTPUTS:
	a tuple containing the list of ToC items and the list of landmark items
	"""

	toc_list: List[TocItem] = []
	landmarks: List[TocItem] = []

	# We make two passes through the work, because we need to know
	# how many bodymatter items there are. So we do landmarks first.
	for textf in file_list:
		file_path = Path(text_path) / textf
		try:
			with open(file_path, encoding="utf-8") as file:
				dom = se.easy_xml.EasyXhtmlTree(file.read())
		except Exception as ex:
			raise se.InvalidFileException(f"Couldn’t open file: [path][link=file://{file_path}]{file_path}[/][/]. Exception: {ex}") from ex

		add_landmark(dom, textf, landmarks)

	# Now we test to see if there is only one body item
	body_items = [item for item in landmarks if item.place == Position.BODY]
	single_file = (len(body_items) == 1)

	nest_under_halftitle = False
	place = Position.NONE

	for textf in file_list:
		with open(Path(text_path) / textf, "r", encoding="utf-8") as file:
			dom = se.easy_xml.EasyXhtmlTree(file.read())

		body = dom.xpath("//body")
		if body:
			place = get_place(body[0])
		else:
			raise se.InvalidInputException("Couldn’t locate body node")

		if place == Position.BACK:
			nest_under_halftitle = False

		process_headings(dom, textf, toc_list, nest_under_halftitle, single_file)

		if textf == "halftitle.xhtml":
			nest_under_halftitle = True

	# We add this dummy item because outputtoc always needs to look ahead to the next item.
	last_toc = TocItem()
	last_toc.level = 1
	last_toc.title = "dummy"
	toc_list.append(last_toc)

	return landmarks, toc_list
def has_transparency(filename: Path) -> bool:
	"""
	Return True if the given image file has transparency
	"""

	try:
		image = Image.open(filename)
	except UnidentifiedImageError as ex:
		raise se.InvalidFileException(f"Couldn’t identify image type of [path][link=file://{filename.resolve()}]{filename}[/].") from ex

	if image.mode == "P":
		transparent = image.info.get("transparency", -1)
		for _, index in image.getcolors():
			if index == transparent:
				return True
	elif image.mode == "RGBA":
		extrema = image.getextrema()
		if extrema[3][0] < 255:
			return True

	return False
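# A minimal usage sketch for has_transparency(); the `se.images` import path is an
# assumption here (adjust it to wherever the function actually lives in your tree).
from pathlib import Path
import se.images

for image_path in Path("src/epub/images").glob("*.png"):
	if se.images.has_transparency(image_path):
		print(f"{image_path} has transparency")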
def get_file(self, file_path: Path) -> str:
	"""
	Get raw file contents of a file in the epub.
	Contents are cached so that we don't hit the disk repeatedly.

	INPUTS
	file_path: A Path pointing to the file

	OUTPUTS
	A string representing the file contents
	"""

	file_path_str = str(file_path)

	if file_path_str not in self._file_cache:
		try:
			with open(file_path, "r", encoding="utf-8") as file:
				file_contents = file.read()
		except Exception as ex:
			raise se.InvalidFileException(f"Couldn’t read file: [path]{file_path_str}[/]") from ex

		self._file_cache[file_path_str] = file_contents

	return self._file_cache[file_path_str]
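# A standalone sketch of the same read-through cache idea as get_file(), for
# illustration only; the class name is hypothetical. The first access hits the
# disk, and later accesses for the same path are served from memory.
from pathlib import Path
from typing import Dict

class FileCache:
	def __init__(self) -> None:
		self._file_cache: Dict[str, str] = {}

	def get_file(self, file_path: Path) -> str:
		key = str(file_path)
		if key not in self._file_cache:
			self._file_cache[key] = file_path.read_text(encoding="utf-8")
		return self._file_cache[key]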
def _create_draft(args: Namespace):
	"""
	Implementation for `se create-draft`
	"""

	# Put together some variables for later use
	authors = []
	translators = []
	illustrators = []
	pg_producers = []
	title = args.title.replace("'", "’")

	for author in args.author:
		authors.append({"name": author.replace("'", "’"), "wiki_url": None, "nacoaf_url": None})

	if args.translator:
		for translator in args.translator:
			translators.append({"name": translator.replace("'", "’"), "wiki_url": None, "nacoaf_url": None})

	if args.illustrator:
		for illustrator in args.illustrator:
			illustrators.append({"name": illustrator.replace("'", "’"), "wiki_url": None, "nacoaf_url": None})

	title_string = title
	if authors and authors[0]["name"].lower() != "anonymous":
		title_string += ", by " + _generate_contributor_string(authors, False)

	identifier = ""
	for author in authors:
		identifier += se.formatting.make_url_safe(author["name"]) + "_"

	identifier = identifier.rstrip("_") + "/" + se.formatting.make_url_safe(title)

	sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", title)

	if translators:
		title_string = title_string + ". Translated by " + _generate_contributor_string(translators, False)

		identifier = identifier + "/"
		for translator in translators:
			identifier += se.formatting.make_url_safe(translator["name"]) + "_"

		identifier = identifier.rstrip("_")

	if illustrators:
		title_string = title_string + ". Illustrated by " + _generate_contributor_string(illustrators, False)

		identifier = identifier + "/"
		for illustrator in illustrators:
			identifier += se.formatting.make_url_safe(illustrator["name"]) + "_"

		identifier = identifier.rstrip("_")

	repo_name = identifier.replace("/", "_")
	repo_path = Path(repo_name).resolve()

	if repo_path.is_dir():
		raise se.InvalidInputException(f"Directory already exists: [path][link=file://{repo_path}]{repo_path}[/][/].")

	# Get data on authors
	for author in authors:
		if not args.offline and author["name"].lower() != "anonymous":
			author["wiki_url"], author["nacoaf_url"] = _get_wikipedia_url(author["name"], True)

	# Get data on translators
	for translator in translators:
		if not args.offline and translator["name"].lower() != "anonymous":
			translator["wiki_url"], translator["nacoaf_url"] = _get_wikipedia_url(translator["name"], True)

	# Get data on illustrators
	for illustrator in illustrators:
		if not args.offline and illustrator["name"].lower() != "anonymous":
			illustrator["wiki_url"], illustrator["nacoaf_url"] = _get_wikipedia_url(illustrator["name"], True)

	# Download PG HTML and do some fixups
	if args.pg_url:
		if args.offline:
			raise se.RemoteCommandErrorException("Cannot download Project Gutenberg ebook when offline option is enabled.")

		args.pg_url = args.pg_url.replace("http://", "https://")

		# Get the ebook metadata
		try:
			response = requests.get(args.pg_url)
			pg_metadata_html = response.text
		except Exception as ex:
			raise se.RemoteCommandErrorException(f"Couldn’t download Project Gutenberg ebook metadata page. Exception: {ex}") from ex

		parser = etree.HTMLParser()
		dom = etree.parse(StringIO(pg_metadata_html), parser)

		# Get the ebook HTML URL from the metadata
		pg_ebook_url = None
		for node in dom.xpath("/html/body//a[contains(@type, 'text/html')]"):
			pg_ebook_url = regex.sub(r"^//", "https://", node.get("href"))
			pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/", pg_ebook_url)

		if not pg_ebook_url:
			raise se.RemoteCommandErrorException("Could download ebook metadata, but couldn’t find URL for the ebook HTML.")

		# Get the ebook LCSH categories
		pg_subjects = []
		for node in dom.xpath("/html/body//td[contains(@property, 'dcterms:subject')]"):
			if node.get("datatype") == "dcterms:LCSH":
				for subject_link in node.xpath("./a"):
					pg_subjects.append(subject_link.text.strip())

		# Get the PG publication date
		pg_publication_year = None
		for node in dom.xpath("//td[@itemprop='datePublished']"):
			pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", node.text)

		# Get the actual ebook URL
		try:
			response = requests.get(pg_ebook_url)
			pg_ebook_html = response.text
		except Exception as ex:
			raise se.RemoteCommandErrorException(f"Couldn’t download Project Gutenberg ebook HTML. Exception: {ex}") from ex

		try:
			fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
			pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
		except Exception as ex:
			raise se.InvalidEncodingException(f"Couldn’t determine text encoding of Project Gutenberg HTML file. Exception: {ex}") from ex

		# Try to guess the ebook language
		pg_language = "en-US"
		if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
			pg_language = "en-GB"

	# Create necessary directories
	(repo_path / "images").mkdir(parents=True)
	(repo_path / "src" / "epub" / "css").mkdir(parents=True)
	(repo_path / "src" / "epub" / "images").mkdir(parents=True)
	(repo_path / "src" / "epub" / "text").mkdir(parents=True)
	(repo_path / "src" / "META-INF").mkdir(parents=True)

	is_pg_html_parsed = True

	# Write PG data if we have it
	if args.pg_url and pg_ebook_html:
		try:
			dom = etree.parse(StringIO(regex.sub(r"encoding=\".+?\"", "", pg_ebook_html)), parser)
			namespaces = {"re": "http://exslt.org/regular-expressions"}

			for node in dom.xpath("//*[re:test(text(), '\\*\\*\\*\\s*Produced by.+')]", namespaces=namespaces):
				producers_text = regex.sub(r"^<[^>]+?>", "", etree.tostring(node, encoding=str, with_tail=False))
				producers_text = regex.sub(r"<[^>]+?>$", "", producers_text)

				producers_text = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", producers_text, flags=regex.DOTALL)
				producers_text = regex.sub(r"\(.+?\)", "", producers_text, flags=regex.DOTALL)
				producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net", "", producers_text, flags=regex.DOTALL)
				producers_text = regex.sub(r"[\r\n]+", " ", producers_text, flags=regex.DOTALL)
				producers_text = regex.sub(r",? and ", ", and ", producers_text)
				producers_text = producers_text.replace(" and the Online", " and The Online")
				producers_text = producers_text.replace(", and ", ", ").strip()

				pg_producers = [producer.strip() for producer in regex.split(',|;', producers_text)]

			# Try to strip out the PG header
			for node in dom.xpath("//*[re:test(text(), '\\*\\*\\*\\s*START OF THIS')]", namespaces=namespaces):
				for sibling_node in node.xpath("./preceding-sibling::*"):
					easy_node = se.easy_xml.EasyXmlElement(sibling_node)
					easy_node.remove()

				easy_node = se.easy_xml.EasyXmlElement(node)
				easy_node.remove()

			# Try to strip out the PG license footer
			for node in dom.xpath("//*[re:test(text(), 'End of (the )?Project Gutenberg')]", namespaces=namespaces):
				for sibling_node in node.xpath("./following-sibling::*"):
					easy_node = se.easy_xml.EasyXmlElement(sibling_node)
					easy_node.remove()

				easy_node = se.easy_xml.EasyXmlElement(node)
				easy_node.remove()

			# lxml will put the xml declaration in a weird place, remove it first
			output = regex.sub(r"<\?xml.+?\?>", "", etree.tostring(dom, encoding="unicode"))

			# Now re-add it
			output = """<?xml version="1.0" encoding="utf-8"?>\n""" + output

			# lxml can also output duplicate default namespace declarations, so keep only the first one
			output = regex.sub(r"(xmlns=\".+?\")(\sxmlns=\".+?\")+", r"\1", output)

			with open(repo_path / "src" / "epub" / "text" / "body.xhtml", "w", encoding="utf-8") as file:
				file.write(output)
		except OSError as ex:
			raise se.InvalidFileException(f"Couldn’t write to ebook directory. Exception: {ex}") from ex
		except Exception:
			# Save this error for later, because it's still useful to complete the create-draft process
			# even if we've failed to parse PG's HTML source.
			is_pg_html_parsed = False
			se.quiet_remove(repo_path / "src" / "epub" / "text" / "body.xhtml")

	# Copy over templates
	_copy_template_file("gitignore", repo_path / ".gitignore")
	_copy_template_file("LICENSE.md", repo_path)
	_copy_template_file("container.xml", repo_path / "src" / "META-INF")
	_copy_template_file("mimetype", repo_path / "src")
	_copy_template_file("content.opf", repo_path / "src" / "epub")
	_copy_template_file("onix.xml", repo_path / "src" / "epub")
	_copy_template_file("toc.xhtml", repo_path / "src" / "epub")
	_copy_template_file("core.css", repo_path / "src" / "epub" / "css")
	_copy_template_file("local.css", repo_path / "src" / "epub" / "css")
	_copy_template_file("se.css", repo_path / "src" / "epub" / "css")
	_copy_template_file("logo.svg", repo_path / "src" / "epub" / "images")
	_copy_template_file("colophon.xhtml", repo_path / "src" / "epub" / "text")
	_copy_template_file("imprint.xhtml", repo_path / "src" / "epub" / "text")
	_copy_template_file("titlepage.xhtml", repo_path / "src" / "epub" / "text")
	_copy_template_file("uncopyright.xhtml", repo_path / "src" / "epub" / "text")
	_copy_template_file("titlepage.svg", repo_path / "images")
	_copy_template_file("cover.jpg", repo_path / "images" / "cover.jpg")
	_copy_template_file("cover.svg", repo_path / "images" / "cover.svg")

	# Try to find Wikipedia links if possible
	ebook_wiki_url = None
	if not args.offline and title != "Short Fiction":
		# There's a "Short Fiction" Wikipedia article, so make an exception for that case
		ebook_wiki_url, _ = _get_wikipedia_url(title, False)

	# Pre-fill a few templates
	_replace_in_file(repo_path / "src" / "epub" / "text" / "titlepage.xhtml", "TITLE_STRING", title_string)
	_replace_in_file(repo_path / "images" / "titlepage.svg", "TITLE_STRING", title_string)
	_replace_in_file(repo_path / "images" / "cover.svg", "TITLE_STRING", title_string)

	# Create the titlepage SVG
	contributors = {}
	if args.translator:
		contributors["translated by"] = _generate_contributor_string(translators, False)

	if args.illustrator:
		contributors["illustrated by"] = _generate_contributor_string(illustrators, False)

	with open(repo_path / "images" / "titlepage.svg", "w", encoding="utf-8") as file:
		file.write(_generate_titlepage_svg(title, [author["name"] for author in authors], contributors, title_string))

	# Create the cover SVG
	with open(repo_path / "images" / "cover.svg", "w", encoding="utf-8") as file:
		file.write(_generate_cover_svg(title, [author["name"] for author in authors], title_string))

	# Build the cover/titlepage for distribution
	epub = SeEpub(repo_path)
	epub.generate_cover_svg()
	epub.generate_titlepage_svg()

	if args.pg_url:
		_replace_in_file(repo_path / "src" / "epub" / "text" / "imprint.xhtml", "PG_URL", args.pg_url)

	# Fill out the colophon
	with open(repo_path / "src" / "epub" / "text" / "colophon.xhtml", "r+", encoding="utf-8") as file:
		colophon_xhtml = file.read()

		colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
		colophon_xhtml = colophon_xhtml.replace("TITLE", title)

		contributor_string = _generate_contributor_string(authors, True)

		if contributor_string == "":
			colophon_xhtml = colophon_xhtml.replace(" by<br/>\n\t\t\t<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>", contributor_string)
		else:
			colophon_xhtml = colophon_xhtml.replace("<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>", contributor_string)

		if translators:
			translator_block = f"It was translated from ORIGINAL_LANGUAGE in TRANSLATION_YEAR by<br/>\n\t\t\t{_generate_contributor_string(translators, True)}.</p>"
			colophon_xhtml = colophon_xhtml.replace("</p>\n\t\t\t<p>This ebook was produced for the<br/>", f"<br/>\n\t\t\t{translator_block}\n\t\t\t<p>This ebook was produced for the<br/>")

		if args.pg_url:
			colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

			if pg_publication_year:
				colophon_xhtml = colophon_xhtml.replace("PG_YEAR", pg_publication_year)

			if pg_producers:
				producers_xhtml = ""
				for i, producer in enumerate(pg_producers):
					if "Distributed Proofread" in producer:
						producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
					elif "anonymous" in producer.lower():
						producers_xhtml = producers_xhtml + "<b class=\"name\">An Anonymous Volunteer</b>"
					else:
						producers_xhtml = producers_xhtml + f"<b class=\"name\">{_add_name_abbr(producer).strip('.')}</b>"

					if i < len(pg_producers) - 1:
						producers_xhtml = producers_xhtml + ", "

					if i == len(pg_producers) - 2:
						producers_xhtml = producers_xhtml + "and "

				producers_xhtml = producers_xhtml + "<br/>"

				colophon_xhtml = colophon_xhtml.replace("<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>", producers_xhtml)

		file.seek(0)
		file.write(colophon_xhtml)
		file.truncate()

	# Fill out the metadata file
	with open(repo_path / "src" / "epub" / "content.opf", "r+", encoding="utf-8") as file:
		metadata_xml = file.read()

		metadata_xml = metadata_xml.replace("SE_IDENTIFIER", identifier)
		metadata_xml = metadata_xml.replace(">TITLE_SORT<", f">{sorted_title}<")
		metadata_xml = metadata_xml.replace(">TITLE<", f">{title}<")
		metadata_xml = metadata_xml.replace("VCS_IDENTIFIER", str(repo_name))

		if pg_producers:
			producers_xhtml = ""
			i = 1
			for producer in pg_producers:
				if "Distributed Proofread" in producer:
					producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">The Online Distributed Proofreading Team</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{i}\">https://pgdp.net</meta>\n"
				elif "anonymous" in producer.lower():
					producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">An Anonymous Volunteer</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Anonymous Volunteer, An</meta>\n"
				else:
					producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">{producer.strip('.')}</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">TRANSCRIBER_SORT</meta>\n"

				producers_xhtml = producers_xhtml + f"\t\t<meta property=\"role\" refines=\"#transcriber-{i}\" scheme=\"marc:relators\">trc</meta>\n"

				i = i + 1

			metadata_xml = regex.sub(r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>", "\t\t" + producers_xhtml.strip(), metadata_xml, flags=regex.DOTALL)

		if ebook_wiki_url:
			metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<", f">{ebook_wiki_url}<")

		authors_xml = _generate_metadata_contributor_xml(authors, "author")
		authors_xml = authors_xml.replace("dc:contributor", "dc:creator")
		metadata_xml = regex.sub(r"<dc:creator id=\"author\">AUTHOR</dc:creator>.+?scheme=\"marc:relators\">aut</meta>", authors_xml, metadata_xml, flags=regex.DOTALL)

		if translators:
			translators_xml = _generate_metadata_contributor_xml(translators, "translator")
			metadata_xml = regex.sub(r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>", translators_xml, metadata_xml, flags=regex.DOTALL)
		else:
			metadata_xml = regex.sub(r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>\n\t\t", "", metadata_xml, flags=regex.DOTALL)

		if illustrators:
			illustrators_xml = _generate_metadata_contributor_xml(illustrators, "illustrator")
			metadata_xml = regex.sub(r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>", illustrators_xml, metadata_xml, flags=regex.DOTALL)
		else:
			metadata_xml = regex.sub(r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>\n\t\t", "", metadata_xml, flags=regex.DOTALL)

		if args.pg_url:
			if pg_subjects:
				subject_xhtml = ""

				i = 1
				for subject in pg_subjects:
					subject_xhtml = subject_xhtml + f"\t\t<dc:subject id=\"subject-{i}\">{subject}</dc:subject>\n"
					i = i + 1

				i = 1
				for subject in pg_subjects:
					subject_xhtml = subject_xhtml + f"\t\t<meta property=\"authority\" refines=\"#subject-{i}\">LCSH</meta>\n"

					# Now, get the LCSH ID by querying LCSH directly.
					try:
						response = requests.get(f"https://id.loc.gov/search/?q=%22{urllib.parse.quote(subject)}%22")
						result = regex.search(fr"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{regex.escape(subject.replace(' -- ', '--'))}</a>", response.text)

						loc_id = "Unknown"
						try:
							loc_id = result.group(1)
						except Exception:
							pass

						subject_xhtml = subject_xhtml + f"\t\t<meta property=\"term\" refines=\"#subject-{i}\">{loc_id}</meta>\n"
					except Exception as ex:
						raise se.RemoteCommandErrorException(f"Couldn’t connect to [url][link=https://id.loc.gov]https://id.loc.gov[/][/]. Exception: {ex}") from ex

					i = i + 1

				metadata_xml = regex.sub(r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>", "\t\t" + subject_xhtml.strip(), metadata_xml)

			metadata_xml = metadata_xml.replace("<dc:language>LANG</dc:language>", f"<dc:language>{pg_language}</dc:language>")
			metadata_xml = metadata_xml.replace("<dc:source>PG_URL</dc:source>", f"<dc:source>{args.pg_url}</dc:source>")

		file.seek(0)
		file.write(metadata_xml)
		file.truncate()

	# Set up local git repo
	repo = git.Repo.init(repo_path)

	if args.email:
		with repo.config_writer() as config:
			config.set_value("user", "email", args.email)

	if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
		raise se.InvalidXhtmlException("Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook.")
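# A hypothetical offline invocation of _create_draft(); in the real CLI, argparse
# builds this Namespace from command-line flags, and the values here are purely
# illustrative.
from argparse import Namespace

args = Namespace(
	title="The Time Machine",
	author=["H. G. Wells"],
	translator=None,
	illustrator=None,
	pg_url=None,  # set to a gutenberg.org URL to pull PG HTML (requires network)
	offline=True,
	email=None,
)
_create_draft(args)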
def svg_text_to_paths(in_svg: Path, out_svg: Path, remove_style=True) -> None:
	"""
	Convert SVG <text> elements into <path> elements, using the SVG document's <style> tag
	and external font files. (These SVG font files are built into the SE tools.)
	The resulting SVG file will have no dependency on external fonts.

	INPUTS
	in_svg: Path of the SVG file whose <text> elements we want to convert.
	out_svg: Path of where to write the resulting SVG file, with <path> elements.

	OUTPUTS
	None.
	"""

	font_paths = []

	name_list = {"league_spartan": ["league-spartan-bold.svg"], "sorts_mill_goudy": ["sorts-mill-goudy-italic.svg", "sorts-mill-goudy.svg"]}
	for font_family, font_names in name_list.items():
		for font_name in font_names:
			with importlib_resources.path(f"se.data.fonts.{font_family}", font_name) as font_path:
				font_paths.append(font_path)

	fonts = []
	for font_path in font_paths:
		font = _parse_font(font_path)
		fonts.append(font)

	with open(in_svg, "rt") as svg_in_raw:
		try:
			xml = etree.fromstring(str.encode(svg_in_raw.read()))
		except Exception as ex:
			raise se.InvalidXmlException(f"Couldn’t parse SVG file: [path][link=file://{in_svg.resolve()}]{in_svg}[/][/].") from ex

	svg_ns = "{http://www.w3.org/2000/svg}"
	style = xml.find(svg_ns + "style")

	# Possibly remove the style tag, if the caller wants that
	def filter_predicate(elem: etree.Element):
		if remove_style and elem.tag.endswith("style"):
			return None  # Remove <style> tag
		return elem  # Keep all other elements

	if remove_style:
		xml = _traverse_element(xml, filter_predicate)

	for elem in xml.iter():
		if elem.tag.endswith("text"):
			properties = _apply_css(elem, style.text)
			_get_properties_from_text_elem(properties, elem)
			_add_font_to_properties(properties, fonts)

			text = elem.text
			if not text:
				raise se.InvalidFileException(f"SVG [xml]<text>[/] element has no content. File: [path][link=file://{in_svg.resolve()}]{in_svg}[/].")

			elem.tag = "g"  # Replace <text> tag with <g> tag
			for k in elem.attrib.keys():
				if k != "class":
					del elem.attrib[k]
				elif k == "class" and elem.attrib["class"] != "title-box":
					# Keep the class attribute only if class="title-box"
					del elem.attrib[k]

			elem.attrib["aria-label"] = text
			elem.tail = "\n"
			elem.text = ""

			_add_svg_paths_to_group(elem, properties)

	xmlstr = etree.tostring(xml, pretty_print=True).decode("UTF-8")
	result_all_text = xmlstr.replace("ns0:", "").replace(":ns0", "")
	result_all_text = se.formatting.format_xml(result_all_text)

	with open(out_svg, "wt") as output:
		output.write(result_all_text)
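# A minimal usage sketch for svg_text_to_paths(); the `se.images` import path is an
# assumption. Converting the titlepage's <text> elements to <path>s means the
# distributed SVG no longer depends on the SE fonts being installed.
from pathlib import Path
import se.images

se.images.svg_text_to_paths(Path("images/titlepage.svg"), Path("src/epub/images/titlepage.svg"), remove_style=True)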
def remove_image_metadata(filename: Path) -> None:
	"""
	Remove exif metadata from an image.

	INPUTS
	filename: A filename of an image

	OUTPUTS
	None.
	"""

	if filename.suffix == ".xcf" or filename.suffix == ".svg":
		# Skip GIMP XCF and SVG files
		return

	if filename.suffix == ".jpg":
		# JPEG images are lossy, and PIL will recompress them on save.
		# Instead of using PIL, read the byte stream and remove all metadata that way.
		# Inspired by https://github.com/hMatoba/Piexif
		with open(filename, "rb+") as file:
			jpeg_data = file.read()

			if jpeg_data[0:2] != b"\xff\xd8":
				raise se.InvalidFileException(f"Invalid JPEG file: [path][link=file://{filename.resolve()}]{filename}[/].")

			exif_segments = []
			head = 2

			# Get a list of metadata segments from the jpg
			while True:
				if jpeg_data[head:head + 2] == b"\xff\xda":
					break

				length = struct.unpack(">H", jpeg_data[head + 2:head + 4])[0]
				end_point = head + length + 2
				seg = jpeg_data[head:end_point]
				head = end_point

				if head >= len(jpeg_data):
					raise se.InvalidFileException(f"Invalid JPEG file: [path][link=file://{filename.resolve()}]{filename}[/].")

				# See https://www.disktuna.com/list-of-jpeg-markers/
				# and https://exiftool.org/TagNames/JPEG.html
				# These are the 15 "app" segments, EXCEPT app 14, as well as the "comment" segment.
				# This mirrors what exiftool does.
				metadata_segments = [b"\xff\xe1", b"\xff\xe2", b"\xff\xe3", b"\xff\xe4", b"\xff\xe5", b"\xff\xe6", b"\xff\xe7", b"\xff\xe8", b"\xff\xe9", b"\xff\xea", b"\xff\xeb", b"\xff\xec", b"\xff\xed", b"\xff\xef", b"\xff\xfe"]

				if seg[0:2] in metadata_segments:
					exif_segments.append(seg)

			# Now replace those segments with nothing
			for segment in exif_segments:
				jpeg_data = jpeg_data.replace(segment, b"")

			file.seek(0)
			file.write(jpeg_data)
			file.truncate()
	else:
		# PNG and the other image types we expect are lossless, so we can use PIL to remove metadata
		try:
			image = Image.open(filename)
		except UnidentifiedImageError as ex:
			raise se.InvalidFileException(f"Couldn’t identify image type of [path][link=file://{filename.resolve()}]{filename}[/].") from ex

		data = list(image.getdata())

		image_without_exif = Image.new(image.mode, image.size)
		image_without_exif.putdata(data)

		if image.format == "PNG":
			# Some metadata, like chromaticity and gamma, is useful to preserve in PNGs
			new_exif = PngImagePlugin.PngInfo()
			for key, value in image.info.items():
				if key.lower() == "gamma":
					new_exif.add(b"gAMA", struct.pack("!1I", int(value * 100000)))
				elif key.lower() == "chromaticity":
					new_exif.add(b"cHRM", struct.pack("!8I",
						int(value[0] * 100000),
						int(value[1] * 100000),
						int(value[2] * 100000),
						int(value[3] * 100000),
						int(value[4] * 100000),
						int(value[5] * 100000),
						int(value[6] * 100000),
						int(value[7] * 100000)))

			image_without_exif.save(filename, optimize=True, pnginfo=new_exif)
		elif image.format == "TIFF":
			# For some reason, when saving as TIFF we have to cast filename to str(), otherwise
			# the save driver throws an exception
			image_without_exif.save(str(filename), compression="tiff_adobe_deflate")
		else:
			image_without_exif.save(str(filename))
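# A small diagnostic sketch (not part of the SE codebase) using the same JPEG
# segment walk as remove_image_metadata(): list each marker and its declared
# length up to the start-of-scan (0xFFDA) segment.
import struct
from pathlib import Path
from typing import List, Tuple

def list_jpeg_segments(filename: Path) -> List[Tuple[str, int]]:
	data = filename.read_bytes()
	if data[0:2] != b"\xff\xd8":
		raise ValueError("Not a JPEG file")

	segments = []
	head = 2
	while head < len(data) and data[head:head + 2] != b"\xff\xda":
		# Each marker segment is 2 marker bytes plus a big-endian length field;
		# the length counts the length field itself but not the marker bytes
		length = struct.unpack(">H", data[head + 2:head + 4])[0]
		segments.append((data[head:head + 2].hex(), length))
		head += length + 2

	return segments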
def generate_endnotes(self) -> Tuple[int, int]:
	"""
	Read the epub spine to regenerate all endnotes in order of appearance, starting from 1.
	Changes are written to disk.

	Returns a tuple of (found_endnote_count, changed_endnote_count)
	"""

	processed = 0
	current_note_number = 1
	notes_changed = 0
	change_list = []

	for file_name in self.get_content_files():
		if file_name in ["titlepage.xhtml", "colophon.xhtml", "uncopyright.xhtml", "imprint.xhtml", "halftitlepage.xhtml", "endnotes.xhtml"]:
			continue

		processed += 1

		file_path = self.path / "src/epub/text" / file_name
		try:
			dom = self.get_dom(file_path)
		except Exception as ex:
			raise se.InvalidFileException(f"Couldn’t open file: [path][link=file://{file_path}]{file_path}[/][/].") from ex

		needs_rewrite = False
		for link in dom.xpath("/html/body//a[contains(@epub:type, 'noteref')]"):
			old_anchor = ""
			href = link.get_attr("href") or ""
			if href:
				# Extract just the anchor from a URL (ie, what follows a hash symbol)
				hash_position = href.find("#") + 1  # we want the characters AFTER the hash
				if hash_position > 0:
					old_anchor = href[hash_position:]

			new_anchor = f"note-{current_note_number:d}"
			if new_anchor != old_anchor:
				change_list.append(f"Changed {old_anchor} to {new_anchor} in {file_name}")
				notes_changed += 1

				# Update the link in the dom
				link.set_attr("href", f"endnotes.xhtml#{new_anchor}")
				link.set_attr("id", f"noteref-{current_note_number:d}")
				link.lxml_element.text = str(current_note_number)
				needs_rewrite = True

			# Now try to find this in endnotes
			match_old = lambda x, old=old_anchor: x.anchor == old
			matches = list(filter(match_old, self.endnotes))
			if not matches:
				raise se.InvalidInputException(f"Couldn’t find endnote with anchor [attr]{old_anchor}[/].")

			if len(matches) > 1:
				raise se.InvalidInputException(f"Duplicate anchors in endnotes file for anchor [attr]{old_anchor}[/].")

			# Found a single match, which is what we want
			endnote = matches[0]
			endnote.number = current_note_number
			endnote.matched = True
			# We don't change the anchor or the back ref just yet
			endnote.source_file = file_name

			current_note_number += 1

		# If we need to write back the body text file
		if needs_rewrite:
			with open(file_path, "w") as file:
				file.write(se.formatting.format_xhtml(dom.to_string()))

	if processed == 0:
		raise se.InvalidInputException("No files processed. Did you update the manifest and order the spine?")

	if notes_changed > 0:
		# Now we need to recreate the endnotes file
		endnotes_dom = self.get_dom(self.path / "src" / "epub" / "text" / "endnotes.xhtml")
		for ol_node in endnotes_dom.xpath("/html/body/section[contains(@epub:type, 'endnotes')]/ol[1]"):
			for node in ol_node.xpath("./li[contains(@epub:type, 'endnote')]"):
				node.remove()

			self.endnotes.sort(key=lambda endnote: endnote.number)

			for endnote in self.endnotes:
				if endnote.matched:
					endnote.node.set_attr("id", f"note-{endnote.number}")

					for node in endnote.node.xpath(".//a[contains(@epub:type, 'backlink')]"):
						node.set_attr("href", f"{endnote.source_file}#noteref-{endnote.number}")

					ol_node.append(endnote.node)

		with open(self.path / "src" / "epub" / "text" / "endnotes.xhtml", "w") as file:
			file.write(se.formatting.format_xhtml(endnotes_dom.to_string()))

	return (current_note_number - 1, notes_changed)
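# A hypothetical driver for generate_endnotes(), assuming SeEpub is importable
# from se.se_epub and that the working directory is an SE ebook repository.
from pathlib import Path
from se.se_epub import SeEpub

epub = SeEpub(Path("."))
found, changed = epub.generate_endnotes()
print(f"Found {found} endnote{'s' if found != 1 else ''}; changed {changed}.")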
def _create_draft(args: Namespace):
	"""
	Implementation for `se create-draft`
	"""

	# Put together some variables for later use
	identifier = se.formatting.make_url_safe(args.author) + "/" + se.formatting.make_url_safe(args.title)
	title_string = args.title.replace("'", "’") + ", by " + args.author.replace("'", "’")
	sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", args.title)
	pg_producers = []

	if args.translator:
		identifier = identifier + "/" + se.formatting.make_url_safe(args.translator)
		title_string = title_string + ". Translated by " + args.translator

	if args.illustrator:
		identifier = identifier + "/" + se.formatting.make_url_safe(args.illustrator)
		title_string = title_string + ". Illustrated by " + args.illustrator

	repo_name = identifier.replace("/", "_")
	repo_path = Path(repo_name).resolve()

	if repo_path.is_dir():
		raise se.InvalidInputException(f"Directory already exists: [path][link=file://{repo_path}]{repo_path}[/][/].")

	# Download PG HTML and do some fixups
	if args.pg_url:
		if args.offline:
			raise se.RemoteCommandErrorException("Cannot download Project Gutenberg ebook when offline option is enabled.")

		args.pg_url = args.pg_url.replace("http://", "https://")

		# Get the ebook metadata
		try:
			response = requests.get(args.pg_url)
			pg_metadata_html = response.text
		except Exception as ex:
			raise se.RemoteCommandErrorException(f"Couldn’t download Project Gutenberg ebook metadata page. Exception: {ex}") from ex

		soup = BeautifulSoup(pg_metadata_html, "lxml")

		# Get the ebook HTML URL from the metadata
		pg_ebook_url = None
		for element in soup.select("a[type^=\"text/html\"]"):
			pg_ebook_url = regex.sub(r"^//", "https://", element["href"])
			pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/", pg_ebook_url)

		if not pg_ebook_url:
			raise se.RemoteCommandErrorException("Could download ebook metadata, but couldn’t find URL for the ebook HTML.")

		# Get the ebook LCSH categories
		pg_subjects = []
		for element in soup.select("td[property=\"dcterms:subject\"]"):
			if element["datatype"] == "dcterms:LCSH":
				for subject_link in element.find("a"):
					pg_subjects.append(subject_link.strip())

		# Get the PG publication date
		pg_publication_year = None
		for element in soup.select("td[itemprop=\"datePublished\"]"):
			pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", element.text)

		# Get the actual ebook URL
		try:
			response = requests.get(pg_ebook_url)
			pg_ebook_html = response.text
		except Exception as ex:
			raise se.RemoteCommandErrorException(f"Couldn’t download Project Gutenberg ebook HTML. Exception: {ex}") from ex

		try:
			fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
			pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
		except Exception as ex:
			raise se.InvalidEncodingException(f"Couldn’t determine text encoding of Project Gutenberg HTML file. Exception: {ex}") from ex

		# Try to guess the ebook language
		pg_language = "en-US"
		if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
			pg_language = "en-GB"

	# Create necessary directories
	(repo_path / "images").mkdir(parents=True)
	(repo_path / "src" / "epub" / "css").mkdir(parents=True)
	(repo_path / "src" / "epub" / "images").mkdir(parents=True)
	(repo_path / "src" / "epub" / "text").mkdir(parents=True)
	(repo_path / "src" / "META-INF").mkdir(parents=True)

	is_pg_html_parsed = True

	# Write PG data if we have it
	if args.pg_url and pg_ebook_html:
		try:
			soup = BeautifulSoup(pg_ebook_html, "html.parser")

			# Try to get the PG producers.
			# We only try this if there's a <pre> block with the header info (which is not always the case)
			for element in soup(text=regex.compile(r"\*\*\*\s*Produced by.+$", flags=regex.DOTALL)):
				if element.parent.name == "pre":
					producers_text = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", element, flags=regex.DOTALL)
					producers_text = regex.sub(r"\(.+?\)", "", producers_text, flags=regex.DOTALL)
					producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net", "", producers_text, flags=regex.DOTALL)
					producers_text = regex.sub(r"[\r\n]+", " ", producers_text, flags=regex.DOTALL)
					producers_text = regex.sub(r",? and ", ", and ", producers_text)
					producers_text = producers_text.replace(" and the Online", " and The Online")
					producers_text = producers_text.replace(", and ", ", ").strip()

					pg_producers = producers_text.split(", ")

			# Try to strip out the PG header
			for element in soup(text=regex.compile(r"\*\*\*\s*START OF THIS")):
				for sibling in element.parent.find_previous_siblings():
					sibling.decompose()

				element.parent.decompose()

			# Try to strip out the PG license footer
			for element in soup(text=regex.compile(r"End of (the )?Project Gutenberg")):
				for sibling in element.parent.find_next_siblings():
					sibling.decompose()

				element.parent.decompose()

			with open(repo_path / "src" / "epub" / "text" / "body.xhtml", "w", encoding="utf-8") as file:
				file.write(str(soup))
		except OSError as ex:
			raise se.InvalidFileException(f"Couldn’t write to ebook directory. Exception: {ex}") from ex
		except Exception:
			# Save this error for later, because it's still useful to complete the create-draft process
			# even if we've failed to parse PG's HTML source.
			is_pg_html_parsed = False
			se.quiet_remove(repo_path / "src" / "epub" / "text" / "body.xhtml")

	# Copy over templates
	_copy_template_file("gitignore", repo_path / ".gitignore")
	_copy_template_file("LICENSE.md", repo_path)
	_copy_template_file("container.xml", repo_path / "src" / "META-INF")
	_copy_template_file("mimetype", repo_path / "src")
	_copy_template_file("content.opf", repo_path / "src" / "epub")
	_copy_template_file("onix.xml", repo_path / "src" / "epub")
	_copy_template_file("toc.xhtml", repo_path / "src" / "epub")
	_copy_template_file("core.css", repo_path / "src" / "epub" / "css")
	_copy_template_file("local.css", repo_path / "src" / "epub" / "css")
	_copy_template_file("logo.svg", repo_path / "src" / "epub" / "images")
	_copy_template_file("colophon.xhtml", repo_path / "src" / "epub" / "text")
	_copy_template_file("imprint.xhtml", repo_path / "src" / "epub" / "text")
	_copy_template_file("titlepage.xhtml", repo_path / "src" / "epub" / "text")
	_copy_template_file("uncopyright.xhtml", repo_path / "src" / "epub" / "text")
	_copy_template_file("titlepage.svg", repo_path / "images")
	_copy_template_file("cover.jpg", repo_path / "images" / "cover.jpg")
	_copy_template_file("cover.svg", repo_path / "images" / "cover.svg")

	# Try to find Wikipedia links if possible
	if args.offline:
		author_wiki_url = None
		author_nacoaf_url = None
		ebook_wiki_url = None
		translator_wiki_url = None
		translator_nacoaf_url = None
	else:
		author_wiki_url, author_nacoaf_url = _get_wikipedia_url(args.author, True)

		ebook_wiki_url = None
		if args.title != "Short Fiction":
			# There's a "Short Fiction" Wikipedia article, so make an exception for that case
			ebook_wiki_url, _ = _get_wikipedia_url(args.title, False)

		translator_wiki_url = None
		if args.translator:
			translator_wiki_url, translator_nacoaf_url = _get_wikipedia_url(args.translator, True)

	# Pre-fill a few templates
	_replace_in_file(repo_path / "src" / "epub" / "text" / "titlepage.xhtml", "TITLE_STRING", title_string)
	_replace_in_file(repo_path / "images" / "titlepage.svg", "TITLE_STRING", title_string)
	_replace_in_file(repo_path / "images" / "cover.svg", "TITLE_STRING", title_string)

	# Create the titlepage SVG
	contributors = {}
	if args.translator:
		contributors["translated by"] = args.translator

	if args.illustrator:
		contributors["illustrated by"] = args.illustrator

	with open(repo_path / "images" / "titlepage.svg", "w", encoding="utf-8") as file:
		file.write(_generate_titlepage_svg(args.title, args.author, contributors, title_string))

	# Create the cover SVG
	with open(repo_path / "images" / "cover.svg", "w", encoding="utf-8") as file:
		file.write(_generate_cover_svg(args.title, args.author, title_string))

	# Build the cover/titlepage for distribution
	epub = SeEpub(repo_path)
	epub.generate_cover_svg()
	epub.generate_titlepage_svg()

	if args.pg_url:
		_replace_in_file(repo_path / "src" / "epub" / "text" / "imprint.xhtml", "PG_URL", args.pg_url)

	with open(repo_path / "src" / "epub" / "text" / "colophon.xhtml", "r+", encoding="utf-8") as file:
		colophon_xhtml = file.read()

		colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
		colophon_xhtml = colophon_xhtml.replace(">AUTHOR<", f">{args.author}<")
		colophon_xhtml = colophon_xhtml.replace("TITLE", args.title)

		if author_wiki_url:
			colophon_xhtml = colophon_xhtml.replace("AUTHOR_WIKI_URL", author_wiki_url)

		if args.pg_url:
			colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

			if pg_publication_year:
				colophon_xhtml = colophon_xhtml.replace("PG_YEAR", pg_publication_year)

			if pg_producers:
				producers_xhtml = ""
				for i, producer in enumerate(pg_producers):
					if "Distributed Proofread" in producer:
						producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
					elif "anonymous" in producer.lower():
						producers_xhtml = producers_xhtml + "<b class=\"name\">An Anonymous Volunteer</b>"
					else:
						producers_xhtml = producers_xhtml + f"<b class=\"name\">{producer.strip('.')}</b>"

					if i < len(pg_producers) - 1:
						producers_xhtml = producers_xhtml + ", "

					if i == len(pg_producers) - 2:
						producers_xhtml = producers_xhtml + "and "

				producers_xhtml = producers_xhtml + "<br/>"

				colophon_xhtml = colophon_xhtml.replace("<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>", producers_xhtml)

		file.seek(0)
		file.write(colophon_xhtml)
		file.truncate()

	with open(repo_path / "src" / "epub" / "content.opf", "r+", encoding="utf-8") as file:
		metadata_xml = file.read()

		metadata_xml = metadata_xml.replace("SE_IDENTIFIER", identifier)
		metadata_xml = metadata_xml.replace(">AUTHOR<", f">{args.author}<")
		metadata_xml = metadata_xml.replace(">TITLE_SORT<", f">{sorted_title}<")
		metadata_xml = metadata_xml.replace(">TITLE<", f">{args.title}<")
		metadata_xml = metadata_xml.replace("VCS_IDENTIFIER", str(repo_name))

		if pg_producers:
			producers_xhtml = ""
			i = 1
			for producer in pg_producers:
				if "Distributed Proofread" in producer:
					producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">The Online Distributed Proofreading Team</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{i}\">https://pgdp.net</meta>\n"
				elif "anonymous" in producer.lower():
					producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">An Anonymous Volunteer</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Anonymous Volunteer, An</meta>\n"
				else:
					producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">{producer.strip('.')}</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">TRANSCRIBER_SORT</meta>\n"

				producers_xhtml = producers_xhtml + f"\t\t<meta property=\"role\" refines=\"#transcriber-{i}\" scheme=\"marc:relators\">trc</meta>\n"

				i = i + 1

			metadata_xml = regex.sub(r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>", "\t\t" + producers_xhtml.strip(), metadata_xml, flags=regex.DOTALL)

		if author_wiki_url:
			metadata_xml = metadata_xml.replace(">AUTHOR_WIKI_URL<", f">{author_wiki_url}<")

		if author_nacoaf_url:
			metadata_xml = metadata_xml.replace(">AUTHOR_NACOAF_URL<", f">{author_nacoaf_url}<")

		if ebook_wiki_url:
			metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<", f">{ebook_wiki_url}<")

		if args.translator:
			metadata_xml = metadata_xml.replace(">TRANSLATOR<", f">{args.translator}<")

			if translator_wiki_url:
				metadata_xml = metadata_xml.replace(">TRANSLATOR_WIKI_URL<", f">{translator_wiki_url}<")

			if translator_nacoaf_url:
				metadata_xml = metadata_xml.replace(">TRANSLATOR_NACOAF_URL<", f">{translator_nacoaf_url}<")
		else:
			metadata_xml = regex.sub(r"<dc:contributor id=\"translator\">.+?<dc:contributor id=\"artist\">", "<dc:contributor id=\"artist\">", metadata_xml, flags=regex.DOTALL)

		if args.pg_url:
			if pg_subjects:
				subject_xhtml = ""

				i = 1
				for subject in pg_subjects:
					subject_xhtml = subject_xhtml + f"\t\t<dc:subject id=\"subject-{i}\">{subject}</dc:subject>\n"
					i = i + 1

				i = 1
				for subject in pg_subjects:
					subject_xhtml = subject_xhtml + f"\t\t<meta property=\"authority\" refines=\"#subject-{i}\">LCSH</meta>\n"

					# Now, get the LCSH ID by querying LCSH directly.
					try:
						response = requests.get(f"https://id.loc.gov/search/?q=%22{urllib.parse.quote(subject)}%22")
						result = regex.search(fr"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{regex.escape(subject.replace(' -- ', '--'))}</a>", response.text)

						loc_id = "Unknown"
						try:
							loc_id = result.group(1)
						except Exception:
							pass

						subject_xhtml = subject_xhtml + f"\t\t<meta property=\"term\" refines=\"#subject-{i}\">{loc_id}</meta>\n"
					except Exception as ex:
						raise se.RemoteCommandErrorException(f"Couldn’t connect to [url][link=https://id.loc.gov]https://id.loc.gov[/][/]. Exception: {ex}") from ex

					i = i + 1

				metadata_xml = regex.sub(r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>", "\t\t" + subject_xhtml.strip(), metadata_xml)

			metadata_xml = metadata_xml.replace("<dc:language>LANG</dc:language>", f"<dc:language>{pg_language}</dc:language>")
			metadata_xml = metadata_xml.replace("<dc:source>PG_URL</dc:source>", f"<dc:source>{args.pg_url}</dc:source>")

		file.seek(0)
		file.write(metadata_xml)
		file.truncate()

	# Set up local git repo
	repo = git.Repo.init(repo_path)

	if args.email:
		with repo.config_writer() as config:
			config.set_value("user", "email", args.email)

	if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
		raise se.InvalidXhtmlException("Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook.")
def process_all_content(file_list: list) -> Tuple[list, list]:
	"""
	Analyze the whole content of the project, and build and return lists of toc_items and landmarks.

	INPUTS:
	file_list: a list of all content files

	OUTPUTS:
	a tuple containing the list of ToC items and the list of landmark items
	"""

	toc_list: List[TocItem] = []
	landmarks: List[TocItem] = []

	# We make two passes through the work, because we need to know
	# how many bodymatter items there are. So we do landmarks first.
	for textf in file_list:
		try:
			with open(textf, encoding="utf-8") as file:
				dom = se.easy_xml.EasyXmlTree(file.read())
		except Exception as ex:
			raise se.InvalidFileException(f"Couldn’t open file: [path][link=file://{textf}]{textf}[/][/]. Exception: {ex}") from ex

		add_landmark(dom, textf.name, landmarks)

	# Now we test to see if there is only one body item
	body_items = [item for item in landmarks if item.place == Position.BODY]
	single_file = (len(body_items) == 1)

	nest_under_halftitle = False

	for textf in file_list:
		with open(textf, "r", encoding="utf-8") as file:
			dom = se.easy_xml.EasyXmlTree(file.read())

		process_headings(dom, textf.name, toc_list, nest_under_halftitle, single_file)

		if dom.xpath("/html/body//*[contains(@epub:type, 'halftitlepage')]"):
			nest_under_halftitle = True

	# Now go through, adjusting for nesting under the halftitle
	if nest_under_halftitle:
		# Tricky, because a few books have forewords etc. AFTER the halftitle, so we have to know if we've passed it
		passed_halftitle = False
		for toc_item in toc_list:
			if toc_item.place == Position.BODY:
				toc_item.level += 1

			if passed_halftitle and toc_item.place == Position.FRONT:
				toc_item.level += 1

			if "halftitle" in toc_item.file_link:
				passed_halftitle = True

	# We add this dummy item because outputtoc always needs to look ahead to the next item.
	last_toc = TocItem()
	last_toc.level = 1
	last_toc.title = "dummy"
	toc_list.append(last_toc)

	return landmarks, toc_list
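# The halftitle-nesting pass above, restated in isolation with minimal stand-in
# types so it runs on its own; this mirrors the logic rather than the SE classes.
from dataclasses import dataclass
from enum import Enum

class Position(Enum):
	NONE = 0
	FRONT = 1
	BODY = 2
	BACK = 3

@dataclass
class TocItem:
	file_link: str = ""
	place: Position = Position.NONE
	level: int = 1

def nest_under_halftitle(toc_list: list) -> None:
	# Bodymatter always indents one level; frontmatter indents only once we've
	# passed the half title, since some books put forewords after it.
	passed_halftitle = False
	for item in toc_list:
		if item.place == Position.BODY:
			item.level += 1
		if passed_halftitle and item.place == Position.FRONT:
			item.level += 1
		if "halftitle" in item.file_link:
			passed_halftitle = True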
def generate_endnotes(self) -> str:
	"""
	The generate_endnotes() function is very big, so for readability and maintainability
	it's broken out into a separate file. Strictly speaking, that file could be inlined
	into this class.
	"""

	processed = 0
	report = ""
	current_note_number = 1
	notes_changed = 0
	change_list = []

	for file_name in self.get_content_files():
		if file_name in ["titlepage.xhtml", "colophon.xhtml", "uncopyright.xhtml", "imprint.xhtml", "halftitle.xhtml", "endnotes.xhtml"]:
			continue

		processed += 1

		file_path = self.path / "src" / "epub" / "text" / file_name
		try:
			with open(file_path) as file:
				soup = BeautifulSoup(file.read(), "lxml")
		except Exception as ex:
			raise se.InvalidFileException("Couldn’t open file: {}".format(str(file_path))) from ex

		links = soup.find_all("a")
		needs_rewrite = False
		for link in links:
			epub_type = link.get("epub:type") or ""
			if epub_type == "noteref":
				old_anchor = ""
				href = link.get("href") or ""
				if href:
					# Extract just the anchor from a URL (ie, what follows a hash symbol)
					hash_position = href.find("#") + 1  # we want the characters AFTER the hash
					if hash_position > 0:
						old_anchor = href[hash_position:]

				new_anchor = "note-{:d}".format(current_note_number)
				if new_anchor != old_anchor:
					change_list.append("Changed " + old_anchor + " to " + new_anchor + " in " + file_name)
					notes_changed += 1

					# Update the link in the soup object
					link["href"] = "endnotes.xhtml#" + new_anchor
					link["id"] = "noteref-{:d}".format(current_note_number)
					link.string = str(current_note_number)
					needs_rewrite = True

				# Now try to find this in endnotes
				matches = list(filter(lambda x, old=old_anchor: x.anchor == old, self.endnotes))
				if not matches:
					raise se.InvalidInputException("Couldn’t find endnote with anchor " + old_anchor)

				if len(matches) > 1:
					raise se.InvalidInputException("Duplicate anchors in endnotes file for anchor " + old_anchor)

				# Found a single match, which is what we want
				endnote = matches[0]
				endnote.number = current_note_number
				endnote.matched = True
				# We don't change the anchor or the back ref just yet
				endnote.source_file = file_name

				current_note_number += 1

		# If we need to write back the body text file
		if needs_rewrite:
			with open(file_path, "w") as new_file:
				new_file.write(se.formatting.format_xhtml(str(soup)))

	if processed == 0:
		report += "No files processed. Did you update the manifest and order the spine?" + "\n"
	else:
		report += "Found {:d} endnotes.".format(current_note_number - 1) + "\n"
		if notes_changed > 0:
			# Now we need to recreate the endnotes file
			ol_tag = self._endnotes_soup.ol
			ol_tag.clear()

			for endnote in self.endnotes:
				if endnote.matched:
					li_tag = self._endnotes_soup.new_tag("li")
					li_tag["id"] = "note-" + str(endnote.number)
					li_tag["epub:type"] = "endnote"
					for content in endnote.contents:
						if isinstance(content, Tag):
							links = content.find_all("a")
							for link in links:
								epub_type = link.get("epub:type") or ""
								if epub_type == "se:referrer":
									href = link.get("href") or ""
									if href:
										link["href"] = endnote.source_file + "#noteref-" + str(endnote.number)

						li_tag.append(content)

					ol_tag.append(li_tag)

			with open(self.path / "src" / "epub" / "text" / "endnotes.xhtml", "w") as file:
				file.write(se.formatting.format_xhtml(str(self._endnotes_soup), is_endnotes_file=True))

			report += "Changed {:d} endnote{}.".format(notes_changed, "s" if notes_changed != 1 else "")
		else:
			report += "No changes made."

	return report
def generate_endnotes(self) -> Tuple[int, int]:
	"""
	Read the epub spine to regenerate all endnotes in order of appearance, starting from 1.
	Changes are written to disk.

	Returns a tuple of (found_endnote_count, changed_endnote_count)
	"""

	processed = 0
	current_note_number = 1
	notes_changed = 0
	change_list = []

	for file_name in self.get_content_files():
		if file_name in ["titlepage.xhtml", "colophon.xhtml", "uncopyright.xhtml", "imprint.xhtml", "halftitle.xhtml", "endnotes.xhtml"]:
			continue

		processed += 1

		file_path = self.path / "src/epub/text" / file_name
		try:
			with open(file_path) as file:
				soup = BeautifulSoup(file.read(), "lxml")
		except Exception as ex:
			raise se.InvalidFileException(f"Couldn’t open file: [path][link=file://{file_path}]{file_path}[/][/].") from ex

		links = soup.find_all("a")
		needs_rewrite = False
		for link in links:
			epub_type = link.get("epub:type") or ""
			if epub_type == "noteref":
				old_anchor = ""
				href = link.get("href") or ""
				if href:
					# Extract just the anchor from a URL (ie, what follows a hash symbol)
					hash_position = href.find("#") + 1  # we want the characters AFTER the hash
					if hash_position > 0:
						old_anchor = href[hash_position:]

				new_anchor = f"note-{current_note_number:d}"
				if new_anchor != old_anchor:
					change_list.append(f"Changed {old_anchor} to {new_anchor} in {file_name}")
					notes_changed += 1

					# Update the link in the soup object
					link["href"] = "endnotes.xhtml#" + new_anchor
					link["id"] = f"noteref-{current_note_number:d}"
					link.string = str(current_note_number)
					needs_rewrite = True

				# Now try to find this in endnotes
				match_old = lambda x, old=old_anchor: x.anchor == old
				matches = list(filter(match_old, self.endnotes))
				if not matches:
					raise se.InvalidInputException(f"Couldn’t find endnote with anchor [attr]{old_anchor}[/].")

				if len(matches) > 1:
					raise se.InvalidInputException(f"Duplicate anchors in endnotes file for anchor [attr]{old_anchor}[/].")

				# Found a single match, which is what we want
				endnote = matches[0]
				endnote.number = current_note_number
				endnote.matched = True
				# We don't change the anchor or the back ref just yet
				endnote.source_file = file_name

				current_note_number += 1

		# If we need to write back the body text file
		if needs_rewrite:
			with open(file_path, "w") as new_file:
				new_file.write(se.formatting.format_xhtml(str(soup)))

	if processed == 0:
		raise se.InvalidInputException("No files processed. Did you update the manifest and order the spine?")

	if notes_changed > 0:
		# Now we need to recreate the endnotes file
		ol_tag = self._endnotes_soup.ol
		ol_tag.clear()

		self.endnotes.sort(key=lambda endnote: endnote.number)

		for endnote in self.endnotes:
			if endnote.matched:
				li_tag = self._endnotes_soup.new_tag("li")
				li_tag["id"] = "note-" + str(endnote.number)
				li_tag["epub:type"] = "endnote"
				for content in endnote.contents:
					if isinstance(content, Tag):
						links = content.find_all("a")
						for link in links:
							epub_type = link.get("epub:type") or ""
							if epub_type == "backlink":
								href = link.get("href") or ""
								if href:
									link["href"] = endnote.source_file + "#noteref-" + str(endnote.number)

					li_tag.append(content)

				ol_tag.append(li_tag)

		with open(self.path / "src" / "epub" / "text" / "endnotes.xhtml", "w") as file:
			file.write(se.formatting.format_xhtml(str(self._endnotes_soup)))

	return (current_note_number - 1, notes_changed)
def create_draft(args: list): """ Entry point for `se create-draft` """ # Put together some variables for later use identifier = se.formatting.make_url_safe( args.author) + "/" + se.formatting.make_url_safe(args.title) title_string = args.title.replace( "'", "’") + ", by " + args.author.replace("'", "’") sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", args.title) pg_producers = [] if args.translator: identifier = identifier + "/" + se.formatting.make_url_safe( args.translator) title_string = title_string + ". Translated by " + args.translator if args.illustrator: identifier = identifier + "/" + se.formatting.make_url_safe( args.illustrator) title_string = title_string + ". Illustrated by " + args.illustrator repo_name = Path(identifier.replace("/", "_")) if repo_name.is_dir(): raise se.InvalidInputException( "./{}/ already exists.".format(repo_name)) # Download PG HTML and do some fixups if args.pg_url: args.pg_url = args.pg_url.replace("http://", "https://") # Get the ebook metadata try: response = requests.get(args.pg_url) pg_metadata_html = response.text except Exception as ex: raise se.RemoteCommandErrorException( "Couldn’t download Project Gutenberg ebook metadata page. Error: {}" .format(ex)) soup = BeautifulSoup(pg_metadata_html, "lxml") # Get the ebook HTML URL from the metadata pg_ebook_url = None for element in soup.select("a[type^=\"text/html\"]"): pg_ebook_url = regex.sub(r"^//", "https://", element["href"]) pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/", pg_ebook_url) if not pg_ebook_url: raise se.RemoteCommandErrorException( "Could download ebook metadata, but couldn’t find URL for the ebook HTML." ) # Get the ebook LCSH categories pg_subjects = [] for element in soup.select("td[property=\"dcterms:subject\"]"): if element["datatype"] == "dcterms:LCSH": for subject_link in element.find("a"): pg_subjects.append(subject_link.strip()) # Get the PG publication date pg_publication_year = None for element in soup.select("td[itemprop=\"datePublished\"]"): pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", element.text) # Get the actual ebook URL try: response = requests.get(pg_ebook_url) pg_ebook_html = response.text except Exception as ex: raise se.RemoteCommandErrorException( "Couldn’t download Project Gutenberg ebook HTML. Error: {}". format(ex)) try: fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False) pg_ebook_html = se.strip_bom(fixed_pg_ebook_html) except Exception as ex: raise se.InvalidEncodingException( "Couldn’t determine text encoding of Project Gutenberg HTML file. Error: {}" .format(ex)) # Try to guess the ebook language pg_language = "en-US" if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html: pg_language = "en-GB" # Create necessary directories (repo_name / "images").mkdir(parents=True) (repo_name / "src" / "epub" / "css").mkdir(parents=True) (repo_name / "src" / "epub" / "images").mkdir(parents=True) (repo_name / "src" / "epub" / "text").mkdir(parents=True) (repo_name / "src" / "META-INF").mkdir(parents=True) # Write PG data if we have it if args.pg_url and pg_ebook_html: try: soup = BeautifulSoup(pg_ebook_html, "html.parser") # Try to get the PG producers. 
	repo_name = Path(identifier.replace("/", "_"))
	if repo_name.is_dir():
		raise se.InvalidInputException("./{}/ already exists.".format(repo_name))

	# Download PG HTML and do some fixups
	if args.pg_url:
		args.pg_url = args.pg_url.replace("http://", "https://")

		# Get the ebook metadata
		try:
			response = requests.get(args.pg_url)
			pg_metadata_html = response.text
		except Exception as ex:
			raise se.RemoteCommandErrorException("Couldn’t download Project Gutenberg ebook metadata page. Error: {}".format(ex))

		soup = BeautifulSoup(pg_metadata_html, "lxml")

		# Get the ebook HTML URL from the metadata
		pg_ebook_url = None
		for element in soup.select("a[type^=\"text/html\"]"):
			pg_ebook_url = regex.sub(r"^//", "https://", element["href"])
			pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/", pg_ebook_url)

		if not pg_ebook_url:
			raise se.RemoteCommandErrorException("Could download ebook metadata, but couldn’t find URL for the ebook HTML.")

		# Get the ebook LCSH categories
		pg_subjects = []
		for element in soup.select("td[property=\"dcterms:subject\"]"):
			if element["datatype"] == "dcterms:LCSH":
				for subject_link in element.find_all("a"):
					pg_subjects.append(subject_link.text.strip())

		# Get the PG publication date
		pg_publication_year = None
		for element in soup.select("td[itemprop=\"datePublished\"]"):
			pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", element.text)

		# Get the actual ebook URL
		try:
			response = requests.get(pg_ebook_url)
			pg_ebook_html = response.text
		except Exception as ex:
			raise se.RemoteCommandErrorException("Couldn’t download Project Gutenberg ebook HTML. Error: {}".format(ex))

		try:
			fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
			pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
		except Exception as ex:
			raise se.InvalidEncodingException("Couldn’t determine text encoding of Project Gutenberg HTML file. Error: {}".format(ex))

		# Try to guess the ebook language
		pg_language = "en-US"
		if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
			pg_language = "en-GB"

	# Create necessary directories
	(repo_name / "images").mkdir(parents=True)
	(repo_name / "src" / "epub" / "css").mkdir(parents=True)
	(repo_name / "src" / "epub" / "images").mkdir(parents=True)
	(repo_name / "src" / "epub" / "text").mkdir(parents=True)
	(repo_name / "src" / "META-INF").mkdir(parents=True)

	# Write PG data if we have it
	if args.pg_url and pg_ebook_html:
		try:
			soup = BeautifulSoup(pg_ebook_html, "html.parser")

			# Try to get the PG producers. We only try this if there's a <pre> block
			# with the header info (which is not always the case).
			for element in soup(text=regex.compile(r"\*\*\*\s*Produced by.+$", flags=regex.DOTALL)):
				if element.parent.name == "pre":
					pg_producers = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", element, flags=regex.DOTALL)
					pg_producers = regex.sub(r"\(.+?\)", "", pg_producers, flags=regex.DOTALL)
					pg_producers = regex.sub(r"(at )?https?://www\.pgdp\.net", "", pg_producers, flags=regex.DOTALL)
					pg_producers = regex.sub(r"[\r\n]+", " ", pg_producers, flags=regex.DOTALL)
					pg_producers = regex.sub(r",? and ", ", and ", pg_producers)
					pg_producers = pg_producers.replace(" and the Online", " and The Online")
					pg_producers = pg_producers.replace(", and ", ", ").strip().split(", ")

			# Try to strip out the PG header
			for element in soup(text=regex.compile(r"\*\*\*\s*START OF THIS")):
				for sibling in element.parent.find_previous_siblings():
					sibling.decompose()
				element.parent.decompose()

			# Try to strip out the PG license footer
			for element in soup(text=regex.compile(r"End of (the )?Project Gutenberg")):
				for sibling in element.parent.find_next_siblings():
					sibling.decompose()
				element.parent.decompose()

			with open(repo_name / "src" / "epub" / "text" / "body.xhtml", "w", encoding="utf-8") as file:
				file.write(str(soup))
		except IOError as ex:
			raise se.InvalidFileException("Couldn’t write to ebook directory. Error: {}".format(ex))
		except Exception as ex:
			raise se.InvalidInputException("Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook.") from ex

	# Copy over templates
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "gitignore")), repo_name / ".gitignore")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "LICENSE.md")), repo_name)
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "META-INF" / "container.xml")), repo_name / "src" / "META-INF")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "mimetype")), repo_name / "src")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "content.opf")), repo_name / "src" / "epub")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "onix.xml")), repo_name / "src" / "epub")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "toc.xhtml")), repo_name / "src" / "epub")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "core.css")), repo_name / "src" / "epub" / "css")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "local.css")), repo_name / "src" / "epub" / "css")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "logo.svg")), repo_name / "src" / "epub" / "images")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "colophon.xhtml")), repo_name / "src" / "epub" / "text")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "imprint.xhtml")), repo_name / "src" / "epub" / "text")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "titlepage.xhtml")), repo_name / "src" / "epub" / "text")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "uncopyright.xhtml")), repo_name / "src" / "epub" / "text")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "titlepage.svg")), repo_name / "images")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "cover.jpg")), repo_name / "images" / "cover.jpg")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "cover.svg")), repo_name / "images" / "cover.svg")
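	# At this point the draft skeleton looks like this (summarized; the file
	# names are taken from the template copies above):
	#
	#   <repo_name>/
	#     .gitignore, LICENSE.md
	#     images/titlepage.svg, cover.jpg, cover.svg
	#     src/mimetype
	#     src/META-INF/container.xml
	#     src/epub/content.opf, onix.xml, toc.xhtml
	#     src/epub/css/core.css, local.css
	#     src/epub/images/logo.svg
	#     src/epub/text/colophon.xhtml, imprint.xhtml, titlepage.xhtml,
	#       uncopyright.xhtml (and body.xhtml, if a PG source was downloaded)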
	# Try to find Wikipedia links if possible
	author_wiki_url, author_nacoaf_url = _get_wikipedia_url(args.author, True)
	ebook_wiki_url, _ = _get_wikipedia_url(args.title, False)
	translator_wiki_url = None
	translator_nacoaf_url = None
	if args.translator:
		translator_wiki_url, translator_nacoaf_url = _get_wikipedia_url(args.translator, True)

	# Pre-fill a few templates
	se.replace_in_file(repo_name / "src" / "epub" / "text" / "titlepage.xhtml", "TITLE_STRING", title_string)
	se.replace_in_file(repo_name / "images" / "titlepage.svg", "TITLE_STRING", title_string)
	se.replace_in_file(repo_name / "images" / "cover.svg", "TITLE_STRING", title_string)

	# Create the titlepage SVG
	contributors = {}
	if args.translator:
		contributors["translated by"] = args.translator
	if args.illustrator:
		contributors["illustrated by"] = args.illustrator

	with open(repo_name / "images" / "titlepage.svg", "w", encoding="utf-8") as file:
		file.write(_generate_titlepage_svg(args.title, args.author, contributors, title_string))

	# Create the cover SVG
	with open(repo_name / "images" / "cover.svg", "w", encoding="utf-8") as file:
		file.write(_generate_cover_svg(args.title, args.author, title_string))

	if args.pg_url:
		se.replace_in_file(repo_name / "src" / "epub" / "text" / "imprint.xhtml", "PG_URL", args.pg_url)

	with open(repo_name / "src" / "epub" / "text" / "colophon.xhtml", "r+", encoding="utf-8") as file:
		colophon_xhtml = file.read()

		colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
		colophon_xhtml = colophon_xhtml.replace(">AUTHOR<", ">{}<".format(args.author))
		colophon_xhtml = colophon_xhtml.replace("TITLE", args.title)

		if author_wiki_url:
			colophon_xhtml = colophon_xhtml.replace("AUTHOR_WIKI_URL", author_wiki_url)

		if args.pg_url:
			colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

			if pg_publication_year:
				colophon_xhtml = colophon_xhtml.replace("PG_YEAR", pg_publication_year)

			if pg_producers:
				producers_xhtml = ""
				for i, producer in enumerate(pg_producers):
					if "Distributed Proofreading" in producer:
						producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
					else:
						producers_xhtml = producers_xhtml + "<b class=\"name\">{}</b>".format(producer)

					if i < len(pg_producers) - 1:
						producers_xhtml = producers_xhtml + ", "

					if i == len(pg_producers) - 2:
						producers_xhtml = producers_xhtml + "and "

				producers_xhtml = producers_xhtml + "<br/>"

				colophon_xhtml = colophon_xhtml.replace("<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>", producers_xhtml)

		file.seek(0)
		file.write(colophon_xhtml)
		file.truncate()
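	# For example (illustrative): pg_producers of ["David Widger", "The Online
	# Distributed Proofreading Team"] would produce the colophon credit line above as:
	#
	#   <b class="name">David Widger</b>, and <a href="https://www.pgdp.net">The Online Distributed Proofreading Team</a><br/>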
	with open(repo_name / "src" / "epub" / "content.opf", "r+", encoding="utf-8") as file:
		metadata_xhtml = file.read()

		metadata_xhtml = metadata_xhtml.replace("SE_IDENTIFIER", identifier)
		metadata_xhtml = metadata_xhtml.replace(">AUTHOR<", ">{}<".format(args.author))
		metadata_xhtml = metadata_xhtml.replace(">TITLE_SORT<", ">{}<".format(sorted_title))
		metadata_xhtml = metadata_xhtml.replace(">TITLE<", ">{}<".format(args.title))
		metadata_xhtml = metadata_xhtml.replace("VCS_IDENTIFIER", str(repo_name))

		if pg_producers:
			producers_xhtml = ""
			i = 1
			for producer in pg_producers:
				producers_xhtml = producers_xhtml + "\t\t<dc:contributor id=\"transcriber-{}\">{}</dc:contributor>\n".format(i, producer)

				if "Distributed Proofreading" in producer:
					producers_xhtml = producers_xhtml + "\t\t<meta property=\"file-as\" refines=\"#transcriber-{0}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{0}\">https://pgdp.net</meta>\n".format(i)
				else:
					producers_xhtml = producers_xhtml + "\t\t<meta property=\"file-as\" refines=\"#transcriber-{}\">TRANSCRIBER_SORT</meta>\n".format(i)

				producers_xhtml = producers_xhtml + "\t\t<meta property=\"role\" refines=\"#transcriber-{}\" scheme=\"marc:relators\">trc</meta>\n".format(i)

				i = i + 1

			metadata_xhtml = regex.sub(r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>", "\t\t" + producers_xhtml.strip(), metadata_xhtml, flags=regex.DOTALL)

		if author_wiki_url:
			metadata_xhtml = metadata_xhtml.replace(">AUTHOR_WIKI_URL<", ">{}<".format(author_wiki_url))

		if author_nacoaf_url:
			metadata_xhtml = metadata_xhtml.replace(">AUTHOR_NACOAF_URL<", ">{}<".format(author_nacoaf_url))

		if ebook_wiki_url:
			metadata_xhtml = metadata_xhtml.replace(">EBOOK_WIKI_URL<", ">{}<".format(ebook_wiki_url))

		if args.translator:
			metadata_xhtml = metadata_xhtml.replace(">TRANSLATOR<", ">{}<".format(args.translator))

			if translator_wiki_url:
				metadata_xhtml = metadata_xhtml.replace(">TRANSLATOR_WIKI_URL<", ">{}<".format(translator_wiki_url))

			if translator_nacoaf_url:
				metadata_xhtml = metadata_xhtml.replace(">TRANSLATOR_NACOAF_URL<", ">{}<".format(translator_nacoaf_url))
		else:
			metadata_xhtml = regex.sub(r"<dc:contributor id=\"translator\">.+?<dc:contributor id=\"artist\">", "<dc:contributor id=\"artist\">", metadata_xhtml, flags=regex.DOTALL)

		if args.pg_url:
			if pg_subjects:
				subject_xhtml = ""

				i = 1
				for subject in pg_subjects:
					subject_xhtml = subject_xhtml + "\t\t<dc:subject id=\"subject-{}\">{}</dc:subject>\n".format(i, subject)
					i = i + 1

				i = 1
				for subject in pg_subjects:
					subject_xhtml = subject_xhtml + "\t\t<meta property=\"authority\" refines=\"#subject-{}\">LCSH</meta>\n".format(i)

					# Now, get the LCSH ID by querying LCSH directly.
					try:
						response = requests.get("http://id.loc.gov/search/?q=%22{}%22".format(urllib.parse.quote(subject)))
						result = regex.search(r"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{}</a>".format(regex.escape(subject.replace(" -- ", "--"))), response.text)

						loc_id = "Unknown"
						try:
							loc_id = result.group(1)
						except Exception:
							pass

						subject_xhtml = subject_xhtml + "\t\t<meta property=\"term\" refines=\"#subject-{}\">{}</meta>\n".format(i, loc_id)
					except Exception as ex:
						raise se.RemoteCommandErrorException("Couldn’t connect to id.loc.gov. Error: {}".format(ex)) from ex

					i = i + 1
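				# For example (illustrative): a subject of "Sea stories" is queried as
				# http://id.loc.gov/search/?q=%22Sea%20stories%22 and the LCSH identifier
				# (a value of the form "shNNNNNNN") is pulled from the matching
				# search-result link; "Unknown" is kept when no match is found.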
				metadata_xhtml = regex.sub(r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>", "\t\t" + subject_xhtml.strip(), metadata_xhtml)

			metadata_xhtml = metadata_xhtml.replace("<dc:language>LANG</dc:language>", "<dc:language>{}</dc:language>".format(pg_language))
			metadata_xhtml = metadata_xhtml.replace("<dc:source>PG_URL</dc:source>", "<dc:source>{}</dc:source>".format(args.pg_url))

		file.seek(0)
		file.write(metadata_xhtml)
		file.truncate()

	# Set up local git repo
	repo = git.Repo.init(repo_name)

	if args.email:
		with repo.config_writer() as config:
			config.set_value("user", "email", args.email)

	# Set up remote git repos
	if args.create_se_repo:
		git_command = git.cmd.Git(repo_name)
		git_command.remote("add", "origin", "standardebooks.org:/standardebooks.org/ebooks/{}.git".format(repo_name))

		# Set git to automatically push to SE
		git_command.config("branch.master.remote", "origin")
		git_command.config("branch.master.merge", "refs/heads/master")

		github_option = ""
		if args.create_github_repo:
			github_option = "--github"

		return_code = call(["ssh", "standardebooks.org", "/standardebooks.org/scripts/init-se-repo --repo-name={} --title-string=\"{}\" {}".format(repo_name, title_string, github_option)])
		if return_code != 0:
			raise se.RemoteCommandErrorException("Failed to create repository on Standard Ebooks server: ssh returned code {}.".format(return_code))
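# A minimal usage sketch (illustrative only): create_draft() is normally invoked
# via the `se create-draft` command line, so the Namespace below just mirrors
# the parsed arguments; all values are hypothetical.
#
#     import argparse
#
#     args = argparse.Namespace(
#         author="Jane Austen", title="Persuasion", translator=None,
#         illustrator=None, pg_url=None, email=None,
#         create_se_repo=False, create_github_repo=False)
#     create_draft(args)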