def process_file(filepath: str) -> "tuple[str, str]":
	"""
	Run through each file, locating titles and updating <title> tag.

	Fix: the original annotation `-> (str, str)` is not a valid type
	annotation (it evaluates to a runtime tuple of classes); a string
	annotation keeps it readable and version-compatible.

	INPUTS:
	filepath: path to content file

	OUTPUTS:
	altered xhtml file text and new section ID (as a tuple);
	("", "") when no usable heading is found
	"""
	xhtml = gethtml(filepath)
	soup = BeautifulSoup(xhtml, "html.parser")
	# Find the first h2..h6 heading; h1 is deliberately skipped because the
	# halftitle's h1 is not a section title.
	heading = soup.find(["h2", "h3", "h4", "h5", "h6"])
	if heading:
		title_info = process_first_heading(heading)
		title_tag = soup.find("title")
		# All enclosing <section> ancestors, used to derive any part prefix.
		sections = heading.find_parents("section")
		get_part_prefix(title_info, sections)
		new_id = title_info.generate_id()
		# Re-id the immediately enclosing section with the generated ID.
		section = heading.find_parent("section")
		if section:
			section["id"] = new_id
		if title_tag:
			title_tag.clear()
			title_tag.append(title_info.output_title_tag())
		return format_xhtml(str(soup)), new_id
	# failure, so return blanks
	return "", ""
def recreate(textpath: str, notes_soup: BeautifulSoup, endnotes: list):
	"""
	Rebuild endnotes.xhtml in the correct (possibly new) order.

	Fix: the output file was opened without a context manager, leaking the
	handle if format_xhtml or write raised; now uses `with`.

	:param textpath: path to text folder in SE project
	:param notes_soup: soup of the existing endnotes.xhtml, whose <ol> is rebuilt in place
	:param endnotes: list of endnote objects; only those flagged `matched` are emitted
	:return: None (writes endnotes.xhtml as a side effect)
	"""
	ol = notes_soup.ol
	ol.clear()
	endnotes.sort(key=lambda enote: enote.number)
	for endnote in endnotes:
		if not endnote.matched:
			# unmatched notes are dropped from the rebuilt file
			continue
		li = notes_soup.new_tag("li")
		li["id"] = "note-" + str(endnote.number)
		li["epub:type"] = "endnote"
		for content in endnote.contents:
			if isinstance(content, Tag):
				# Repoint backlinks at the (possibly renumbered) noteref.
				for link in content.find_all("a"):
					epub_type = link.get("epub:type") or ""
					if "se:referrer" in epub_type or "backlink" in epub_type:
						href = link.get("href") or ""
						if href:
							link["href"] = endnote.source_file + "#noteref-" + str(endnote.number)
			li.append(content)
		ol.append(li)
	with open(os.path.join(textpath, "endnotes.xhtml"), "w") as new_file:
		new_file.write(format_xhtml(str(notes_soup)))
def output_toc(item_list: list, landmark_list, toc_path: str, work_type: str,
			work_title: str) -> str:
	"""
	Output the constructed ToC based on the lists of items and landmarks
	found, replacing the item and landmark lists in the existing ToC file
	and returning the formatted result.
	"""
	if len(item_list) < 2:
		raise se.InvalidInputException("Too few ToC items found.")

	toc_soup: BeautifulSoup = get_existing_toc(toc_path)
	if toc_soup is None:
		raise se.InvalidInputException("Existing ToC not found.")

	# There should be exactly two nav sections: items first, landmarks second.
	nav_sections = toc_soup.find_all("nav")
	if len(nav_sections) < 2:
		raise se.InvalidInputException(
			"Existing ToC has too few nav sections.")

	items_ol = nav_sections[0].find("ol")
	items_ol.clear()
	landmarks_ol = nav_sections[1].find("ol")
	landmarks_ol.clear()

	items_ol.append(BeautifulSoup(process_items(item_list), "html.parser"))
	landmarks_ol.append(BeautifulSoup(
		process_landmarks(landmark_list, work_type, work_title),
		"html.parser"))

	return format_xhtml(str(toc_soup))
def process_file(text_path: str, file_name: str, endnotes: list,
			de_orphan: bool, current_note_number: int) -> int:
	"""
	Reads a content file, locates and processes the endnotes, accumulating
	info on them in a global list, and returns the next note number.

	Fixes: the rewritten file was opened without a context manager (handle
	leaked on a write error); dead commented-out code removed; the
	filter/lambda match lookup replaced with a comprehension.

	:param text_path: path to the text files in the project
	:param file_name: the name of the file being processed eg chapter-1.xhtml
	:param endnotes: list of notes we are building
	:param de_orphan: remove reference in text if no matching endnote
	:param current_note_number: the current note number we are allocating
	:return: the next note number to use
	"""
	global notes_changed
	file_path = os.path.join(text_path, file_name)
	xhtml = gethtml(file_path)
	soup = BeautifulSoup(xhtml, "lxml")
	needs_rewrite = False
	for link in soup.find_all("a"):
		epub_type = link.get("epub:type") or ""
		if epub_type != "noteref":
			continue
		old_anchor = ""
		href = link.get("href") or ""
		if href:
			old_anchor = extract_anchor(href)
		new_anchor = "note-{:d}".format(current_note_number)
		if new_anchor != old_anchor:
			print("Changed " + old_anchor + " to " + new_anchor + " in " + file_path)
			notes_changed += 1
			# update the link in the soup object
			link["href"] = "endnotes.xhtml#" + new_anchor
			link["id"] = "noteref-{:d}".format(current_note_number)
			link.string = str(current_note_number)
			needs_rewrite = True
		# now try to find this in endnotes
		matches = [note for note in endnotes if note.anchor == old_anchor]
		if not matches:
			print("Couldn't find endnote with anchor " + old_anchor)
			if de_orphan:
				print("Removing orphan note ref in text")
				# NOTE(review): clear() empties the <a> but leaves the tag in
				# place — preserved from the original; confirm if full removal
				# (decompose) was intended.
				link.clear()
				needs_rewrite = True
		elif len(matches) > 1:
			print("Duplicate anchors in endnotes file for anchor " + old_anchor)
		else:
			# found a single match, which is what we want
			listnote = matches[0]
			listnote.number = current_note_number
			listnote.matched = True
			# we don't change the anchor or the back ref just yet
			listnote.source_file = file_name
		current_note_number += 1
	# if we need to write back the body text file
	if needs_rewrite:
		with open(file_path, "w") as new_file:
			new_file.write(format_xhtml(str(soup)))
	return current_note_number